bio-twobit 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ # https://github.com/red-data-tools/red-datasets/blob/master/lib/datasets/cache-path.rb
2
+
3
require "fileutils"
require "pathname"

module Bio
  class TwoBit
    # Locates, and can delete, the on-disk cache directory that belongs to
    # one cache id. The directory is "<system cache dir>/bio-twobit/<id>".
    class CachePath
      # @param id [String] name of the sub-directory below "bio-twobit"
      def initialize(id)
        @id = id
      end

      # @return [Pathname] absolute path of the cache directory for this id
      def base_dir
        Pathname(system_cache_dir).expand_path + "bio-twobit" + @id
      end

      # Recursively deletes the cache directory when it exists.
      #
      # @return [Object, nil] nil when there was nothing to delete
      def remove
        dir = base_dir
        FileUtils.rmtree(dir.to_s, secure: true) if dir.exist?
      end

      private

      # Picks the per-user cache root for the running platform. An environment
      # override is honoured only when it is non-empty.
      def system_cache_dir
        case RUBY_PLATFORM
        when /mswin/, /mingw/
          env_dir("LOCALAPPDATA") || "~/AppData/Local"
        when /darwin/
          "~/Library/Caches"
        else
          env_dir("XDG_CACHE_HOME") || "~/.cache"
        end
      end

      # Returns the value of the environment variable, or nil when it is unset
      # or empty. An empty value would otherwise expand to the current working
      # directory and the cache would be created (and removed) there.
      def env_dir(name)
        value = ENV[name]
        value unless value.nil? || value.empty?
      end
    end
  end
end
@@ -0,0 +1,282 @@
1
+ # https://github.com/red-data-tools/red-datasets/blob/master/lib/datasets/downloader.rb
2
+
3
+ require "fileutils"
4
+ begin
5
+ require "io/console"
6
+ rescue LoadError
7
+ end
8
+ require "net/http"
9
+ require "pathname"
10
+
11
module Bio
  class TwoBit
    # Downloads one file over HTTP or HTTPS.
    #
    # The body is streamed into "<output_path>.partial" and moved to
    # output_path only after the whole body has been read. An existing partial
    # file is resumed with a Range request. Progress is printed to $stderr when
    # it is a terminal.
    class Downloader
      # Raised when the redirect limit of start_http is exhausted.
      class TooManyRedirects < StandardError; end

      # @param url [String, URI::Generic] location of the file
      # @raise [ArgumentError] when the URL is neither HTTP nor HTTPS
      def initialize(url)
        @url = url.is_a?(URI::Generic) ? url.dup : URI.parse(url)
        return if @url.is_a?(URI::HTTP)

        raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
      end

      # Downloads the URL to output_path. Does nothing when output_path
      # already exists.
      #
      # @param output_path [Pathname, String] final location of the file
      # @raise [TooManyRedirects] when the redirect limit is exceeded
      # @raise [Net::HTTPExceptions] (response.error_type) on a non-success response
      def download(output_path)
        output_path = Pathname(output_path)
        return if output_path.exist?

        output_path.parent.mkpath

        headers = {
          "Accept-Encoding" => "identity",
          "User-Agent" => "BioTwobit/#{VERSION}"
        }
        start = nil
        partial_output_path = Pathname.new("#{output_path}.partial")
        if partial_output_path.exist?
          start = partial_output_path.size
          headers["Range"] = "bytes=#{start}-"
        end

        start_http(@url, headers) do |response|
          # Anything other than 206 carries the whole file, so the partial
          # file is overwritten instead of appended to.
          start = nil unless response.is_a?(Net::HTTPPartialContent)
          mode = start ? "ab" : "wb"

          size_current = start || 0
          size_max = total_size(response.content_length, start)
          progress_reporter = ProgressReporter.new(progress_name, size_max)
          partial_output_path.open(mode) do |output|
            response.read_body do |chunk|
              size_current += chunk.bytesize
              progress_reporter.report(size_current)
              output.write(chunk)
            end
          end
        end
        FileUtils.mv(partial_output_path, output_path)
      rescue TooManyRedirects => e
        # Re-raise with both the first and the last URL of the redirect chain.
        last_url = e.message[/\Atoo many redirections: (.+)\z/, 1]
        raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
      end

      # Issues a GET request and yields the first successful response,
      # following at most `limit` redirects.
      private def start_http(url, headers, limit = 10, &block)
        raise TooManyRedirects, "too many redirections: #{url}" if limit == 0

        http = Net::HTTP.new(url.hostname, url.port)
        # http.set_debug_output($stderr)
        http.use_ssl = (url.scheme == "https")
        http.start do
          # request_uri is path plus query, and "/" when the path is empty.
          request = Net::HTTP::Get.new(url.request_uri, headers)
          http.request(request) do |response|
            case response
            when Net::HTTPSuccess
              return block.call(response)
            when Net::HTTPRedirection
              location = response["location"]
              raise http_error(response, url) if location.nil? || location.empty?

              # Location may be relative; resolve it against the current URL.
              redirect_url = url.merge(location)
              unless redirect_url.is_a?(URI::HTTP)
                raise ArgumentError, "download URL must be HTTP or HTTPS: <#{redirect_url}>"
              end

              warn "Redirect to #{redirect_url}"
              return start_http(redirect_url, headers, limit - 1, &block)
            else
              raise http_error(response, url)
            end
          end
        end
      end

      # Builds the exception for a response that cannot be used.
      private def http_error(response, url)
        message = response.code
        message += ": #{response.message}" if response.message && !response.message.empty?
        message += ": #{url}"
        response.error_type.new(message, response)
      end

      # Expected size of the complete file, or nil when the response does not
      # announce a Content-Length.
      #
      # @param content_length [Integer, nil] Content-Length of this response
      # @param offset [Integer, nil] bytes already on disk when resuming
      private def total_size(content_length, offset)
        return nil if content_length.nil?

        content_length + (offset || 0)
      end

      # Name shown in the progress line: last path segment of the URL, or the
      # host name when the path has no segment.
      private def progress_name
        @url.path.split("/").last || @url.hostname
      end

      # Prints a single-line, self-overwriting progress message to $stderr,
      # at most about once per second.
      class ProgressReporter
        # @param base_name [String] label printed in front of the progress
        # @param size_max [Integer, nil] total size in bytes; nil disables reporting
        def initialize(base_name, size_max)
          @base_name = base_name.to_s
          @size_max = size_max

          @time_previous = Time.now
          @size_previous = 0

          @need_report = ($stderr == STDERR && $stderr.tty?)
        end

        # @param size_current [Integer] number of bytes received so far
        def report(size_current)
          return unless @need_report
          return if @size_max.nil?

          done = (size_current == @size_max)
          time_current = Time.now
          elapsed = time_current - @time_previous
          return if !done && elapsed <= 1
          # Checked after the throttle: foreground? reads a file or spawns ps,
          # which is too expensive to do for every received chunk.
          return unless foreground?

          read_bytes = size_current - @size_previous
          throughput = elapsed.positive? ? read_bytes / elapsed : 0.0
          @time_previous = time_current
          @size_previous = size_current

          message = build_message(size_current, throughput)
          $stderr.print("\r#{message}") if message
          $stderr.puts if done
        end

        private

        # Returns the progress line fitted to the terminal width, or nil when
        # even the numeric part does not fit.
        def build_message(size_current, throughput)
          percent = (size_current / @size_max.to_f) * 100
          formatted_size = format("[%s/%s]", format_size(size_current), format_size(@size_max))
          rest_second = remaining_seconds(size_current, throughput)
          separator = " - "
          progress = format("%05.1f%% %s %s %s", percent, formatted_size, format_time_interval(rest_second),
                            format_throughput(throughput))
          base_name = @base_name

          # guess_terminal_width always returns an Integer (it falls back to 80).
          width = guess_terminal_width
          return nil if progress.size > width

          base_name_width = width - progress.size - separator.size
          if base_name.size > base_name_width
            ellipsis = "..."
            shorten_base_name_width = base_name_width - ellipsis.size
            return progress if shorten_base_name_width < 1

            base_name = base_name[0, shorten_base_name_width] + ellipsis
          end
          "#{base_name}#{separator}#{progress}"
        end

        # Estimated seconds until completion, or nil when nothing was
        # transferred in the last interval (no estimate possible).
        def remaining_seconds(size_current, throughput)
          return nil unless throughput.positive?

          (@size_max - size_current) / throughput
        end

        # Formats a byte count with decimal (1000-based) units.
        def format_size(size)
          if size < 1000
            "%d" % size
          elsif size < (1000**2)
            format("%6.2fKB", (size.to_f / 1000))
          elsif size < (1000**3)
            format("%6.2fMB", (size.to_f / (1000**2)))
          elsif size < (1000**4)
            format("%6.2fGB", (size.to_f / (1000**3)))
          else
            format("%.2fTB", (size.to_f / (1000**4)))
          end
        end

        # Formats seconds as "HH:MM:SS", prefixed with "<n>d " from one day on.
        # nil or a non-finite value is shown as "--:--:--"; a negative value
        # is shown as zero.
        def format_time_interval(interval)
          return "--:--:--" if interval.nil? || !interval.finite?

          interval = 0 if interval.negative?
          minute, second = interval.divmod(60)
          hour, minute = minute.divmod(60)
          day, hour = hour.divmod(24)
          clock = format("%02d:%02d:%02d", hour, minute, second)
          day.zero? ? clock : "#{day}d #{clock}"
        end

        # Formats a rate given in bytes per second (report computes it as
        # received bytes divided by elapsed seconds).
        def format_throughput(throughput)
          if throughput <= 1000
            "%3dB/s" % throughput
          elsif throughput <= (1000**2)
            format("%3dKB/s", (throughput / 1000))
          elsif throughput <= (1000**3)
            format("%3dMB/s", (throughput / (1000**2)))
          else
            format("%3dGB/s", (throughput / (1000**3)))
          end
        end

        # Whether this process currently owns the terminal. Returns false when
        # neither /proc/self/stat nor /bin/ps is available.
        def foreground?
          proc_stat_path = "/proc/self/stat"
          ps_path = "/bin/ps"

          if File.exist?(proc_stat_path)
            # Drop "pid (comm) " so that splitting is not confused by spaces
            # in the command name, then compare fields 2 and 5 of the rest.
            stat = File.read(proc_stat_path).sub(/\A.+\) /, "").split
            process_group_id = stat[2]
            terminal_process_group_id = stat[5]
            process_group_id == terminal_process_group_id
          elsif File.executable?(ps_path)
            IO.pipe do |input, output|
              pid = spawn(ps_path, "-o", "stat", "-p", Process.pid.to_s,
                          { out: output, err: output })
              output.close
              _, status = Process.waitpid2(pid)
              return false unless status.success?

              # Foreground when the last output line contains "+".
              input.each_line.to_a.last.to_s.include?("+")
            end
          else
            false
          end
        end

        # Terminal width in columns; 80 when it cannot be determined.
        def guess_terminal_width
          guess_terminal_width_from_io ||
            guess_terminal_width_from_command ||
            guess_terminal_width_from_env ||
            80
        end

        # Width reported by the console (or $stderr), nil when unavailable.
        def guess_terminal_width_from_io
          console = IO.respond_to?(:console) ? IO.console : nil
          io = console || ($stderr.respond_to?(:winsize) ? $stderr : nil)
          return nil if io.nil?

          width = io.winsize[1]
          width.positive? ? width : nil
        rescue SystemCallError
          nil
        end

        # Width reported by "tput cols", nil when tput is missing or fails.
        def guess_terminal_width_from_command
          IO.pipe do |input, output|
            begin
              pid = spawn("tput", "cols", { out: output, err: output })
            rescue SystemCallError
              return nil
            end

            output.close
            _, status = Process.waitpid2(pid)
            return nil unless status.success?

            Integer(input.read.chomp, 10, exception: false)
          end
        end

        # Width taken from the COLUMNS or TERM_WIDTH environment variable.
        def guess_terminal_width_from_env
          env = ENV["COLUMNS"] || ENV["TERM_WIDTH"]
          return nil if env.nil?

          Integer(env, 10, exception: false)
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Bio
  class TwoBit
    # Descriptive record attached to a genome: id, name, url and description.
    # All members are optional and default to nil.
    #
    # The Struct is assigned to the constant rather than subclassed, which
    # avoids an extra anonymous superclass and the "superclass mismatch"
    # error that a second evaluation of this file would otherwise raise.
    Metadata = Struct.new(:id,
                          :name,
                          :url,
                          :description)
  end
end
@@ -0,0 +1,24 @@
1
module Bio
  class TwoBit
    # Base class for genomes whose 2bit file is downloaded on first use.
    #
    # A subclass sets @metadata and @data_url (and optionally @local_name)
    # before calling super from its initialize.
    class ReferenceGenome < TwoBit
      # Same as TwoBit.open, but takes no file name: the file is determined by
      # the subclass.
      #
      # @raise [ArgumentError] when any positional argument is given
      def self.open(*args, **kwargs, &block)
        # empty? rather than any?: any? ignores nil and false arguments.
        raise ArgumentError, "#{self}#open() does not accept arguments" unless args.empty?

        super(**kwargs, &block)
      end

      # @param kwargs keyword arguments forwarded to the superclass initializer
      def initialize(**kwargs)
        # The cached file is named after the last segment of the data URL
        # unless the subclass chose a name.
        @local_name ||= File.basename(@data_url)
        super(prepare_data, **kwargs)
      end

      private

      # Calls download for the cache location of the data file and returns
      # that location.
      def prepare_data
        data_path = cache_dir_path + @local_name
        download(data_path, @data_url)
        data_path
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "danRer10"; the 2bit file comes from the UCSC download
    # URL assigned to @data_url.
    class DanRer10 < ReferenceGenome
      # @param kwargs keyword arguments forwarded to ReferenceGenome#initialize
      def initialize(**kwargs)
        genome = "danRer10"
        # Members in order: id, name, url (description stays nil).
        @metadata = Metadata.new(genome, genome, "https://hgdownload.soe.ucsc.edu/downloads.html")
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/danRer10/bigZips/danRer10.2bit"
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "danRer11"; the 2bit file comes from the UCSC download
    # URL assigned to @data_url.
    class DanRer11 < ReferenceGenome
      # @param kwargs keyword arguments forwarded to ReferenceGenome#initialize
      def initialize(**kwargs)
        genome = "danRer11"
        # Members in order: id, name, url (description stays nil).
        @metadata = Metadata.new(genome, genome, "https://hgdownload.soe.ucsc.edu/downloads.html")
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/bigZips/danRer11.2bit"
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "dm6"; the 2bit file comes from the UCSC download URL
    # assigned to @data_url.
    class Dm6 < ReferenceGenome
      # @param kwargs keyword arguments forwarded to ReferenceGenome#initialize
      def initialize(**kwargs)
        genome = "dm6"
        # Members in order: id, name, url (description stays nil).
        @metadata = Metadata.new(genome, genome, "https://hgdownload.soe.ucsc.edu/downloads.html")
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/dm6/bigZips/dm6.2bit"
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "hg19"; the 2bit file comes from the UCSC download URL
    # assigned to @data_url.
    class Hg19 < ReferenceGenome
      # @param kwargs keyword arguments forwarded to ReferenceGenome#initialize
      def initialize(**kwargs)
        genome = "hg19"
        # Members in order: id, name, url (description stays nil).
        @metadata = Metadata.new(genome, genome, "https://hgdownload.soe.ucsc.edu/downloads.html")
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit"
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "hg38"; the 2bit file comes from the UCSC download URL
    # assigned to @data_url.
    class Hg38 < ReferenceGenome
      # @param kwargs keyword arguments forwarded to ReferenceGenome#initialize
      def initialize(**kwargs)
        genome = "hg38"
        # Members in order: id, name, url (description stays nil).
        @metadata = Metadata.new(genome, genome, "https://hgdownload.soe.ucsc.edu/downloads.html")
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit"
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "hs1"; the 2bit file comes from the UCSC download URL
    # assigned to @data_url.
    class Hs1 < ReferenceGenome
      # @param kwargs keyword arguments forwarded to ReferenceGenome#initialize
      def initialize(**kwargs)
        genome = "hs1"
        # Members in order: id, name, url (description stays nil).
        @metadata = Metadata.new(genome, genome, "https://hgdownload.soe.ucsc.edu/downloads.html")
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.2bit"
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "mm10"; the 2bit file comes from the UCSC download URL
    # assigned to @data_url.
    class Mm10 < ReferenceGenome
      # @param kwargs keyword arguments forwarded to ReferenceGenome#initialize
      def initialize(**kwargs)
        genome = "mm10"
        # Members in order: id, name, url (description stays nil).
        @metadata = Metadata.new(genome, genome, "https://hgdownload.soe.ucsc.edu/downloads.html")
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.2bit"
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "mm39"; the 2bit file comes from the UCSC download URL
    # assigned to @data_url.
    class Mm39 < ReferenceGenome
      # @param kwargs keyword arguments forwarded to ReferenceGenome#initialize
      def initialize(**kwargs)
        genome = "mm39"
        # Members in order: id, name, url (description stays nil).
        @metadata = Metadata.new(genome, genome, "https://hgdownload.soe.ucsc.edu/downloads.html")
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.2bit"
        super(**kwargs)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative "../reference_genome"
2
+
3
module Bio
  class TwoBit
    # Reference genome "mm9"; the 2bit file comes from the UCSC download URL
    # assigned to @data_url.
    class Mm9 < ReferenceGenome
      # @param kwargs keyword arguments forwarded to ReferenceGenome#initialize
      def initialize(**kwargs)
        genome = "mm9"
        # Members in order: id, name, url (description stays nil).
        @metadata = Metadata.new(genome, genome, "https://hgdownload.soe.ucsc.edu/downloads.html")
        @data_url = "https://hgdownload.soe.ucsc.edu/goldenPath/mm9/bigZips/mm9.2bit"
        super(**kwargs)
      end
    end
  end
end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Bio
4
4
  class TwoBit
5
- VERSION = "0.1.3"
5
+ VERSION = "0.2.0"
6
6
  end
7
7
  end
data/lib/bio/twobit.rb CHANGED
@@ -1,26 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "pathname"
3
4
  require_relative "twobit/version"
4
5
  require_relative "twobit/twobit"
6
+ require_relative "twobit/cache_path"
7
+ require_relative "twobit/downloader"
8
+ require_relative "twobit/metadata"
5
9
 
6
10
  module Bio
7
11
  # Reader for .2bit files (i.e., from UCSC genome browser)
8
12
  class TwoBit
9
- def self.open(*args, **kwargs)
10
- file = new(*args, **kwargs)
11
- return file unless block_given?
12
-
13
- begin
14
- yield file
15
- ensure
16
- file.close
13
+ attr_reader :metadata
14
+
15
+ class << self
16
# Creates a reader with the given arguments and returns it.
# When a block is given, the reader is yielded to it and closed afterwards,
# even if the block raises.
def open(*args, **kwargs)
  reader = new(*args, **kwargs)
  if block_given?
    begin
      yield reader
    ensure
      reader.close
    end
  end
  reader
end
27
+
28
+ # Here, const_missing is used instead of autoload.
29
+
30
# Loads "twobit/references/<downcased constant name>.rb" on first reference to
# an unknown constant and returns the constant. Falls back to the default
# const_missing when no such file exists.
def const_missing(name)
  # A name that is missing a second time is handed to super, so a file that
  # does not define its constant cannot cause endless recursion via const_get.
  attempted = (@missing_const ||= [])
  super if attempted.include? name
  attempted << name

  reference_file = File.join(__dir__, "twobit/references", "#{name.to_s.downcase}.rb")
  return super unless File.exist?(reference_file)

  require reference_file
  const_get(name)
end
18
- file
19
43
  end
20
44
 
21
45
  def initialize(fname, masked: false)
22
46
  raise "TwoBit::new() does not take block; use TwoBit::open() instead" if block_given?
23
47
 
48
+ fname = fname.to_path if fname.respond_to?(:to_path)
24
49
  @fname = fname
25
50
  if masked
26
51
  mskd = 1
@@ -49,6 +74,8 @@ module Bio
49
74
  sequence_raw(chrom, start, stop)
50
75
  end
51
76
 
77
+ alias seq sequence
78
+
52
79
  def bases(chrom, start = 0, stop = 0, fraction: true)
53
80
  raise ArgumentError, "negative start position" if start.negative?
54
81
  raise ArgumentError, "negative stop position" if stop.negative?
@@ -69,5 +96,24 @@ module Bio
69
96
 
70
97
  soft_masked_blocks_raw(chrom, start, stop)
71
98
  end
99
+
100
# Removes this object's cache directory by delegating to cache_path.remove.
def clear_cache!
  cache_path.remove
end
103
+
104
+ private
105
+
106
# Directory for this object's cached files (base_dir of cache_path).
def cache_dir_path
  cache_path.base_dir
end
109
+
110
# Memoized CachePath keyed by the id of @metadata.
def cache_path
  return @cache_path if @cache_path

  @cache_path = CachePath.new(@metadata.id)
end
113
+
114
# Downloads url to output_path with a Downloader.
def download(output_path, url)
  Downloader.new(url).download(output_path)
end
72
118
  end
73
119
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-twobit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - kojix2
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-07-29 00:00:00.000000000 Z
11
+ date: 2023-01-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: This is a Ruby binding for lib2bit(https://github.com/dpryan79/lib2bit),
14
14
  which provides high-speed access to genomic data in 2bit file format.
@@ -19,12 +19,8 @@ extensions:
19
19
  - ext/bio/twobit/extconf.rb
20
20
  extra_rdoc_files: []
21
21
  files:
22
- - ".rubocop.yml"
23
- - Gemfile
24
22
  - LICENSE.txt
25
23
  - README.md
26
- - Rakefile
27
- - bio-twobit.gemspec
28
24
  - ext/bio/twobit/2bit.c
29
25
  - ext/bio/twobit/2bit.h
30
26
  - ext/bio/twobit/LICENSE
@@ -32,6 +28,19 @@ files:
32
28
  - ext/bio/twobit/twobit.c
33
29
  - ext/bio/twobit/twobit.h
34
30
  - lib/bio/twobit.rb
31
+ - lib/bio/twobit/cache_path.rb
32
+ - lib/bio/twobit/downloader.rb
33
+ - lib/bio/twobit/metadata.rb
34
+ - lib/bio/twobit/reference_genome.rb
35
+ - lib/bio/twobit/references/danrer10.rb
36
+ - lib/bio/twobit/references/danrer11.rb
37
+ - lib/bio/twobit/references/dm6.rb
38
+ - lib/bio/twobit/references/hg19.rb
39
+ - lib/bio/twobit/references/hg38.rb
40
+ - lib/bio/twobit/references/hs1.rb
41
+ - lib/bio/twobit/references/mm10.rb
42
+ - lib/bio/twobit/references/mm39.rb
43
+ - lib/bio/twobit/references/mm9.rb
35
44
  - lib/bio/twobit/version.rb
36
45
  homepage: https://github.com/ruby-on-bioc/bio-twobit
37
46
  licenses:
@@ -52,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
52
61
  - !ruby/object:Gem::Version
53
62
  version: '0'
54
63
  requirements: []
55
- rubygems_version: 3.3.7
64
+ rubygems_version: 3.4.1
56
65
  signing_key:
57
66
  specification_version: 4
58
67
  summary: A ruby library for accessing 2bit files
data/.rubocop.yml DELETED
@@ -1,13 +0,0 @@
1
- AllCops:
2
- TargetRubyVersion: 2.6
3
-
4
- Style/StringLiterals:
5
- Enabled: true
6
- EnforcedStyle: double_quotes
7
-
8
- Style/StringLiteralsInInterpolation:
9
- Enabled: true
10
- EnforcedStyle: double_quotes
11
-
12
- Layout/LineLength:
13
- Max: 120