bio-twobit 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,32 @@
1
+ # https://github.com/red-data-tools/red-datasets/blob/master/lib/datasets/cache-path.rb
2
+
3
module Bio
  class TwoBit
    # Resolves the on-disk cache directory for one dataset id, placed under
    # the current user's platform-specific cache directory, and can delete it.
    class CachePath
      # @param id [String] name of the sub-directory created under
      #   "<system cache dir>/bio-twobit/".
      def initialize(id)
        @id = id
      end

      # @return [Pathname] absolute path "<system cache dir>/bio-twobit/<id>".
      #   The directory is not created here.
      def base_dir
        Pathname(system_cache_dir).expand_path + "bio-twobit" + @id
      end

      # Recursively deletes the cache directory when it exists; does nothing
      # otherwise.
      def remove
        dir = base_dir
        FileUtils.rmtree(dir.to_s, secure: true) if dir.exist?
      end

      private

      # Picks the per-user cache root from RUBY_PLATFORM: LOCALAPPDATA on
      # Windows, ~/Library/Caches on macOS, XDG_CACHE_HOME elsewhere, each
      # with a home-relative fallback.
      def system_cache_dir
        case RUBY_PLATFORM
        when /mswin/, /mingw/
          env_dir("LOCALAPPDATA") || "~/AppData/Local"
        when /darwin/
          "~/Library/Caches"
        else
          env_dir("XDG_CACHE_HOME") || "~/.cache"
        end
      end

      # Returns the value of environment variable +name+, or nil when it is
      # unset or empty. An empty value must not be used: expanding "" yields
      # the current working directory, which would put the cache (and
      # #remove) under whatever directory the process happens to run in.
      def env_dir(name)
        value = ENV[name]
        value unless value.nil? || value.empty?
      end
    end
  end
end
@@ -0,0 +1,282 @@
1
+ # https://github.com/red-data-tools/red-datasets/blob/master/lib/datasets/downloader.rb
2
+
3
+ require "fileutils"
4
+ begin
5
+ require "io/console"
6
+ rescue LoadError
7
+ end
8
+ require "net/http"
9
+ require "pathname"
10
+
11
module Bio
  class TwoBit
    # Downloads a single file over HTTP or HTTPS.
    #
    # Data is first written to "<output_path>.partial" and moved to
    # output_path once the body has been read completely. When a partial
    # file already exists, a Range request is sent so the transfer can resume.
    class Downloader
      # Raised when a chain of redirects exceeds the allowed limit.
      class TooManyRedirects < StandardError; end

      # @param url [String, URI::Generic] location of the file to download.
      # @raise [ArgumentError] when the URL is neither HTTP nor HTTPS.
      def initialize(url)
        @url = url.is_a?(URI::Generic) ? url.dup : URI.parse(url)
        return if @url.is_a?(URI::HTTP)

        raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
      end

      # Downloads the URL to +output_path+. Returns immediately when
      # +output_path+ already exists.
      #
      # @param output_path [Pathname] destination file; its parent directory
      #   is created when missing.
      # @raise [TooManyRedirects] when the redirect limit is exceeded.
      # @raise [Net::HTTPExceptions] (the response's error_type) on other
      #   non-success responses.
      def download(output_path)
        return if output_path.exist?

        output_path.parent.mkpath

        headers = {
          "Accept-Encoding" => "identity",
          "User-Agent" => "BioTwobit/#{VERSION}"
        }
        start = nil
        partial_output_path = Pathname.new("#{output_path}.partial")
        if partial_output_path.exist?
          # Ask the server to continue where the previous attempt stopped.
          start = partial_output_path.size
          headers["Range"] = "bytes=#{start}-"
        end

        start_http(@url, headers) do |response|
          if response.is_a?(Net::HTTPPartialContent)
            mode = "ab"
          else
            # The server ignored the Range header: start over from scratch.
            start = nil
            mode = "wb"
          end

          # A URL without a path component has no last segment; fall back to
          # an empty name rather than nil so the progress message can be built.
          base_name = @url.path.split("/").last.to_s
          size_current = 0
          size_max = response.content_length
          if start
            size_current += start
            # content_length is nil when the server sends no Content-Length.
            size_max += start if size_max
          end
          progress_reporter = ProgressReporter.new(base_name, size_max)
          partial_output_path.open(mode) do |output|
            response.read_body do |chunk|
              size_current += chunk.bytesize
              progress_reporter.report(size_current)
              output.write(chunk)
            end
          end
        end
        FileUtils.mv(partial_output_path, output_path)
      rescue TooManyRedirects => e
        # Re-raise with both the first and the last URL of the redirect chain.
        last_url = e.message[/\Atoo many redirections: (.+)\z/, 1]
        raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
      end

      # Issues a GET request to +url+ and yields the successful response
      # while its body is still unread. Redirects are followed up to +limit+
      # times.
      private def start_http(url, headers, limit = 10, &block)
        raise TooManyRedirects, "too many redirections: #{url}" if limit == 0

        http = Net::HTTP.new(url.hostname, url.port)
        # http.set_debug_output($stderr)
        http.use_ssl = (url.scheme == "https")
        http.start do
          path = url.path.to_s
          # "http://host" has an empty path, which Net::HTTP rejects.
          path = "/" if path.empty?
          path += "?#{url.query}" if url.query
          request = Net::HTTP::Get.new(path, headers)
          http.request(request) do |response|
            case response
            when Net::HTTPSuccess, Net::HTTPPartialContent
              return block.call(response)
            when Net::HTTPRedirection
              location = response["location"]
              # Some 3xx responses (e.g. 304) carry no Location header.
              raise_http_error(response, url) if location.nil? || location.empty?

              # Location may be relative; resolve it against the current URL.
              url = URI.join(url.to_s, location)
              warn "Redirect to #{url}"
              return start_http(url, headers, limit - 1, &block)
            else
              raise_http_error(response, url)
            end
          end
        end
      end

      # Raises the exception class associated with +response+, with a message
      # of the form "<code>: <reason>: <url>".
      private def raise_http_error(response, url)
        message = response.code
        message += ": #{response.message}" if response.message && !response.message.empty?
        message += ": #{url}"
        raise response.error_type.new(message, response)
      end

      # Prints a single-line, self-overwriting progress message to $stderr.
      # Reporting is active only when $stderr is the original STDERR and a TTY.
      class ProgressReporter
        # @param base_name [String] label shown in front of the progress data.
        # @param size_max [Integer, nil] expected total size in bytes; nil
        #   disables reporting.
        def initialize(base_name, size_max)
          @base_name = base_name
          @size_max = size_max

          @time_previous = Time.now
          @size_previous = 0

          @need_report = ($stderr == STDERR && $stderr.tty?)
        end

        # Reports progress given the total number of bytes received so far.
        # Output is emitted at most about once per second, plus once on
        # completion.
        def report(size_current)
          return unless @need_report
          return if @size_max.nil?

          done = (size_current == @size_max)
          time_current = Time.now
          elapsed = time_current - @time_previous
          return if !done && elapsed <= 1
          # Checked after the cheap throttle test because it reads /proc or
          # spawns ps.
          return unless foreground?

          read_bytes = size_current - @size_previous
          # Guard against a zero interval (coarse clocks) producing Infinity.
          throughput = elapsed.positive? ? read_bytes.to_f / elapsed : 0.0
          @time_previous = time_current
          @size_previous = size_current

          message = build_message(size_current, throughput)
          $stderr.print("\r#{message}") if message
          $stderr.puts if done
        end

        private

        # Builds the progress line, shortening the base name so the line fits
        # the terminal width. Returns nil when even the progress part alone
        # does not fit.
        def build_message(size_current, throughput)
          percent = (size_current / @size_max.to_f) * 100
          formatted_size = format("[%s/%s]", format_size(size_current), format_size(@size_max))
          # Without measurable throughput the remaining time is unknown;
          # show zero instead of dividing by zero.
          rest_second = throughput.positive? ? (@size_max - size_current) / throughput : 0
          separator = " - "
          progress = format("%05.1f%% %s %s %s", percent, formatted_size, format_time_interval(rest_second),
                            format_throughput(throughput))
          base_name = @base_name

          width = guess_terminal_width
          return "#{base_name}#{separator}#{progress}" if width.nil?

          return nil if progress.size > width

          base_name_width = width - progress.size - separator.size
          if base_name.size > base_name_width
            ellipsis = "..."
            shorten_base_name_width = base_name_width - ellipsis.size
            return progress if shorten_base_name_width < 1

            base_name = base_name[0, shorten_base_name_width] + ellipsis
          end
          "#{base_name}#{separator}#{progress}"
        end

        # Formats a byte count using decimal (powers of 1000) units.
        def format_size(size)
          if size < 1000
            "%d" % size
          elsif size < (1000**2)
            format("%6.2fKB", (size.to_f / 1000))
          elsif size < (1000**3)
            format("%6.2fMB", (size.to_f / (1000**2)))
          elsif size < (1000**4)
            format("%6.2fGB", (size.to_f / (1000**3)))
          else
            format("%.2fTB", (size.to_f / (1000**4)))
          end
        end

        # Formats a duration in seconds as "HH:MM:SS", prefixed with
        # "<days>d " for intervals of one day or more.
        def format_time_interval(interval)
          if interval < 60
            "00:00:%02d" % interval
          elsif interval < (60 * 60)
            minute, second = interval.divmod(60)
            format("00:%02d:%02d", minute, second)
          elsif interval < (60 * 60 * 24)
            minute, second = interval.divmod(60)
            hour, minute = minute.divmod(60)
            format("%02d:%02d:%02d", hour, minute, second)
          else
            minute, second = interval.divmod(60)
            hour, minute = minute.divmod(60)
            day, hour = hour.divmod(24)
            format("%dd %02d:%02d:%02d", day, hour, minute, second)
          end
        end

        # Formats a transfer rate given in bytes per second. The value comes
        # from byte counts, so it must not be converted from bits.
        def format_throughput(throughput)
          throughput_byte = throughput
          if throughput_byte <= 1000
            "%3dB/s" % throughput_byte
          elsif throughput_byte <= (1000**2)
            format("%3dKB/s", (throughput_byte / 1000))
          elsif throughput_byte <= (1000**3)
            format("%3dMB/s", (throughput_byte / (1000**2)))
          else
            format("%3dGB/s", (throughput_byte / (1000**3)))
          end
        end

        # Tells whether this process is in the terminal's foreground process
        # group, so that background jobs do not write progress output.
        def foreground?
          proc_stat_path = "/proc/self/stat"
          ps_path = "/bin/ps"

          if File.exist?(proc_stat_path)
            # Drop the leading "pid (comm) " so the remaining fields can be
            # split on whitespace even when comm contains spaces. Per proc(5)
            # the remaining fields start with state, ppid, pgrp, session,
            # tty_nr, tpgid.
            stat = File.read(proc_stat_path).sub(/\A.+\) /, "").split
            process_group_id = stat[2]
            terminal_process_group_id = stat[5]
            process_group_id == terminal_process_group_id
          elsif File.executable?(ps_path)
            IO.pipe do |input, output|
              pid = spawn(ps_path, "-o", "stat", "-p", Process.pid.to_s,
                          { out: output, err: output })
              output.close
              _, status = Process.waitpid2(pid)
              return false unless status.success?

              # The last output line is checked for "+"; ps(1) documents "+"
              # in the state column as the foreground-process-group marker.
              # Empty output counts as not foreground.
              last_line = input.each_line.to_a.last
              last_line ? last_line.include?("+") : false
            end
          else
            false
          end
        end

        # Returns the terminal width in columns, trying the console, then
        # `tput cols`, then environment variables, and finally assuming 80.
        def guess_terminal_width
          guess_terminal_width_from_io ||
            guess_terminal_width_from_command ||
            guess_terminal_width_from_env ||
            80
        end

        # Width reported by IO.console when io/console is loaded, otherwise
        # by $stderr. Returns nil when no console is available or the query
        # fails.
        def guess_terminal_width_from_io
          io = IO.respond_to?(:console) ? IO.console : $stderr
          return nil unless io.respond_to?(:winsize)

          io.winsize[1]
        rescue SystemCallError
          nil
        end

        # Width printed by `tput cols`, or nil when the command is missing,
        # fails, or prints something that is not an integer.
        def guess_terminal_width_from_command
          IO.pipe do |input, output|
            begin
              pid = spawn("tput", "cols", { out: output, err: output })
            rescue SystemCallError
              return nil
            end

            output.close
            _, status = Process.waitpid2(pid)
            return nil unless status.success?

            result = input.read.chomp
            begin
              Integer(result, 10)
            rescue ArgumentError
              nil
            end
          end
        end

        # Width taken from the COLUMNS or TERM_WIDTH environment variable, or
        # nil when neither holds an integer.
        def guess_terminal_width_from_env
          env = ENV["COLUMNS"] || ENV["TERM_WIDTH"]
          return nil if env.nil?

          begin
            Integer(env, 10)
          rescue ArgumentError
            nil
          end
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Bio
  class TwoBit
    # Descriptive information attached to a TwoBit object.
    #
    # Members: +id+, +name+, +url+ and +description+. All are optional and
    # default to nil.
    # NOTE(review): the meaning of each member is not defined in this file;
    # confirm against the classes that populate it.
    #
    # Assigned directly from Struct.new rather than subclassing it, which
    # would insert an extra anonymous class into the ancestry.
    Metadata = Struct.new(:id,
                          :name,
                          :url,
                          :description)
  end
end
@@ -0,0 +1,24 @@
1
module Bio
  class TwoBit
    # Base class for TwoBit readers whose data file is downloaded on demand.
    #
    # A subclass sets @data_url (and @metadata, which the cache directory
    # lookup relies on) before calling +super+ from its own #initialize.
    # @local_name may also be preset to override the cached file name.
    class ReferenceGenome < TwoBit
      # Same as TwoBit.open but without a file name argument.
      #
      # @raise [ArgumentError] when positional arguments are given.
      def self.open(*args, **kwargs, &block)
        raise ArgumentError, "#{self}#open() does not accept arguments" if args.any?

        super(**kwargs, &block)
      end

      # Ensures the data file is present in the cache directory and then
      # initializes the parent class with its path.
      #
      # @param kwargs keyword arguments forwarded to TwoBit#initialize.
      # @raise [TypeError] when the subclass has not set @data_url.
      def initialize(**kwargs)
        # File.basename(nil) would raise a TypeError with an unhelpful
        # message; keep the exception class but say what is actually wrong.
        if @data_url.nil?
          raise TypeError, "#{self.class} must set @data_url before calling ReferenceGenome#initialize"
        end

        @local_name ||= File.basename(@data_url)
        super(prepare_data, **kwargs)
      end

      private

      # Downloads the data file into the cache directory (the download helper
      # decides whether any transfer is needed) and returns its path.
      def prepare_data
        data_path = cache_dir_path + @local_name
        download(data_path, @data_url)
        data_path
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "danRer10", fetched as a .2bit file from the UCSC
    # download server.
    class DanRer10 < ReferenceGenome
      # Records the metadata (id, name, landing page URL) and the data URL,
      # then hands the keyword arguments to ReferenceGenome#initialize.
      def initialize(**kwargs)
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/danRer10/bigZips/danRer10.2bit"
        @metadata = Metadata.new(
          "danRer10",
          "danRer10",
          "https://hgdownload.soe.ucsc.edu/downloads.html"
        )
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "danRer11", fetched as a .2bit file from the UCSC
    # download server.
    class DanRer11 < ReferenceGenome
      # Records the metadata (id, name, landing page URL) and the data URL,
      # then hands the keyword arguments to ReferenceGenome#initialize.
      def initialize(**kwargs)
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/bigZips/danRer11.2bit"
        @metadata = Metadata.new(
          "danRer11",
          "danRer11",
          "https://hgdownload.soe.ucsc.edu/downloads.html"
        )
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "dm6", fetched as a .2bit file from the UCSC download
    # server.
    class Dm6 < ReferenceGenome
      # Records the metadata (id, name, landing page URL) and the data URL,
      # then hands the keyword arguments to ReferenceGenome#initialize.
      def initialize(**kwargs)
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/dm6/bigZips/dm6.2bit"
        @metadata = Metadata.new(
          "dm6",
          "dm6",
          "https://hgdownload.soe.ucsc.edu/downloads.html"
        )
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "hg19", fetched as a .2bit file from the UCSC download
    # server.
    class Hg19 < ReferenceGenome
      # Records the metadata (id, name, landing page URL) and the data URL,
      # then hands the keyword arguments to ReferenceGenome#initialize.
      def initialize(**kwargs)
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit"
        @metadata = Metadata.new(
          "hg19",
          "hg19",
          "https://hgdownload.soe.ucsc.edu/downloads.html"
        )
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "hg38", fetched as a .2bit file from the UCSC download
    # server.
    class Hg38 < ReferenceGenome
      # Records the metadata (id, name, landing page URL) and the data URL,
      # then hands the keyword arguments to ReferenceGenome#initialize.
      def initialize(**kwargs)
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit"
        @metadata = Metadata.new(
          "hg38",
          "hg38",
          "https://hgdownload.soe.ucsc.edu/downloads.html"
        )
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "hs1", fetched as a .2bit file from the UCSC download
    # server.
    class Hs1 < ReferenceGenome
      # Records the metadata (id, name, landing page URL) and the data URL,
      # then hands the keyword arguments to ReferenceGenome#initialize.
      def initialize(**kwargs)
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.2bit"
        @metadata = Metadata.new(
          "hs1",
          "hs1",
          "https://hgdownload.soe.ucsc.edu/downloads.html"
        )
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "mm10", fetched as a .2bit file from the UCSC download
    # server.
    class Mm10 < ReferenceGenome
      # Records the metadata (id, name, landing page URL) and the data URL,
      # then hands the keyword arguments to ReferenceGenome#initialize.
      def initialize(**kwargs)
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.2bit"
        @metadata = Metadata.new(
          "mm10",
          "mm10",
          "https://hgdownload.soe.ucsc.edu/downloads.html"
        )
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "mm39", fetched as a .2bit file from the UCSC download
    # server.
    class Mm39 < ReferenceGenome
      # Records the metadata (id, name, landing page URL) and the data URL,
      # then hands the keyword arguments to ReferenceGenome#initialize.
      def initialize(**kwargs)
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.2bit"
        @metadata = Metadata.new(
          "mm39",
          "mm39",
          "https://hgdownload.soe.ucsc.edu/downloads.html"
        )
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "mm9", fetched as a .2bit file from the UCSC download
    # server.
    class Mm9 < ReferenceGenome
      # Records the metadata (id, name, landing page URL) and the data URL,
      # then hands the keyword arguments to ReferenceGenome#initialize.
      def initialize(**kwargs)
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/mm9/bigZips/mm9.2bit"
        @metadata = Metadata.new(
          "mm9",
          "mm9",
          "https://hgdownload.soe.ucsc.edu/downloads.html"
        )
        super(**kwargs)
      end
    end
  end
end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Bio
4
4
  class TwoBit
5
- VERSION = "0.1.3"
5
+ VERSION = "0.2.0"
6
6
  end
7
7
  end
data/lib/bio/twobit.rb CHANGED
@@ -1,26 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "pathname"
3
4
  require_relative "twobit/version"
4
5
  require_relative "twobit/twobit"
6
+ require_relative "twobit/cache_path"
7
+ require_relative "twobit/downloader"
8
+ require_relative "twobit/metadata"
5
9
 
6
10
  module Bio
7
11
  # Reader for .2bit files (i.e., from UCSC genome browser)
8
12
  class TwoBit
9
- def self.open(*args, **kwargs)
10
- file = new(*args, **kwargs)
11
- return file unless block_given?
12
-
13
- begin
14
- yield file
15
- ensure
16
- file.close
13
+ attr_reader :metadata
14
+
15
+ class << self
16
+ def open(*args, **kwargs)
17
+ file = new(*args, **kwargs)
18
+ return file unless block_given?
19
+
20
+ begin
21
+ yield file
22
+ ensure
23
+ file.close
24
+ end
25
+ file
26
+ end
27
+
28
+ # Here, const_missing is used instead of autoload.
29
+
30
+ def const_missing(name)
31
+ # prevents const_get from being called recursively many times.
32
+ @missing_const ||= []
33
+ super if @missing_const.include? name
34
+ @missing_const << name
35
+
36
+ path = File.join(__dir__, "twobit/references", "#{name.to_s.downcase}.rb")
37
+ if File.exist?(path)
38
+ require path
39
+ return const_get(name)
40
+ end
41
+ super
17
42
  end
18
- file
19
43
  end
20
44
 
21
45
  def initialize(fname, masked: false)
22
46
  raise "TwoBit::new() does not take block; use TwoBit::open() instead" if block_given?
23
47
 
48
+ fname = fname.to_path if fname.respond_to?(:to_path)
24
49
  @fname = fname
25
50
  if masked
26
51
  mskd = 1
@@ -49,6 +74,8 @@ module Bio
49
74
  sequence_raw(chrom, start, stop)
50
75
  end
51
76
 
77
+ alias seq sequence
78
+
52
79
  def bases(chrom, start = 0, stop = 0, fraction: true)
53
80
  raise ArgumentError, "negative start position" if start.negative?
54
81
  raise ArgumentError, "negative stop position" if stop.negative?
@@ -69,5 +96,24 @@ module Bio
69
96
 
70
97
  soft_masked_blocks_raw(chrom, start, stop)
71
98
  end
99
+
100
+ def clear_cache!
101
+ cache_path.remove
102
+ end
103
+
104
+ private
105
+
106
+ def cache_dir_path
107
+ cache_path.base_dir
108
+ end
109
+
110
+ def cache_path
111
+ @cache_path ||= CachePath.new(@metadata.id)
112
+ end
113
+
114
+ def download(output_path, url)
115
+ downloader = Downloader.new(url)
116
+ downloader.download(output_path)
117
+ end
72
118
  end
73
119
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-twobit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - kojix2
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-07-29 00:00:00.000000000 Z
11
+ date: 2023-01-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: This is a Ruby binding for lib2bit(https://github.com/dpryan79/lib2bit),
14
14
  which provides high-speed access to genomic data in 2bit file format.
@@ -19,12 +19,8 @@ extensions:
19
19
  - ext/bio/twobit/extconf.rb
20
20
  extra_rdoc_files: []
21
21
  files:
22
- - ".rubocop.yml"
23
- - Gemfile
24
22
  - LICENSE.txt
25
23
  - README.md
26
- - Rakefile
27
- - bio-twobit.gemspec
28
24
  - ext/bio/twobit/2bit.c
29
25
  - ext/bio/twobit/2bit.h
30
26
  - ext/bio/twobit/LICENSE
@@ -32,6 +28,19 @@ files:
32
28
  - ext/bio/twobit/twobit.c
33
29
  - ext/bio/twobit/twobit.h
34
30
  - lib/bio/twobit.rb
31
+ - lib/bio/twobit/cache_path.rb
32
+ - lib/bio/twobit/downloader.rb
33
+ - lib/bio/twobit/metadata.rb
34
+ - lib/bio/twobit/reference_genome.rb
35
+ - lib/bio/twobit/references/danrer10.rb
36
+ - lib/bio/twobit/references/danrer11.rb
37
+ - lib/bio/twobit/references/dm6.rb
38
+ - lib/bio/twobit/references/hg19.rb
39
+ - lib/bio/twobit/references/hg38.rb
40
+ - lib/bio/twobit/references/hs1.rb
41
+ - lib/bio/twobit/references/mm10.rb
42
+ - lib/bio/twobit/references/mm39.rb
43
+ - lib/bio/twobit/references/mm9.rb
35
44
  - lib/bio/twobit/version.rb
36
45
  homepage: https://github.com/ruby-on-bioc/bio-twobit
37
46
  licenses:
@@ -52,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
52
61
  - !ruby/object:Gem::Version
53
62
  version: '0'
54
63
  requirements: []
55
- rubygems_version: 3.3.7
64
+ rubygems_version: 3.4.1
56
65
  signing_key:
57
66
  specification_version: 4
58
67
  summary: A ruby library for accessing 2bit files
data/.rubocop.yml DELETED
@@ -1,13 +0,0 @@
1
- AllCops:
2
- TargetRubyVersion: 2.6
3
-
4
- Style/StringLiterals:
5
- Enabled: true
6
- EnforcedStyle: double_quotes
7
-
8
- Style/StringLiteralsInInterpolation:
9
- Enabled: true
10
- EnforcedStyle: double_quotes
11
-
12
- Layout/LineLength:
13
- Max: 120