miga-base 1.3.10.1 → 1.3.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e3a7f2c88342fa48b9b5a1485c3b371bdff02e2e5bf19af7c24ab9b19b3fd373
4
- data.tar.gz: cffaf705081cb22a9c3834e91a6ed46ccd86d138433d65a4e78269c6698f73be
3
+ metadata.gz: 601e480270ea7f04ff3f5057fa5f031146ac2e73247a7c191ea391a17e971bfc
4
+ data.tar.gz: 1577f19ec6b7798305a847da42556a16ff9e6a144c2e317e3349a8b49a0069fd
5
5
  SHA512:
6
- metadata.gz: '093441822af4f362f210d0bbb0b4ade917757a5a5ba6938dec60038fdbdb73f75b0a479d2c930e223a54e81fce0ba9ab83b7f42ccc6a77944f33b66b53ad91a8'
7
- data.tar.gz: 688e9925740ac96b258dd25521c5c5d63be05c2f0ebdf82a0b669bac0bd0d5c4def96537bcd4e1d411aeb03e6662b631447a5ed748c7de6c3631ea23f0d1021b
6
+ metadata.gz: '0293ffeea41a27ddef7a7b077d8cd93f86f5b9e95537576f8ce87120242cf3ced5f9554966cf648dcb817134977fbe6ee13a45b0815069cf2471014d780ce8c4'
7
+ data.tar.gz: b7ab71aa3a78d32861e1e9f6ff685424530093fd485185be722c6cc139a81d30692e00ccb6cbad4799bbcbd077ce32c7693db8336c98e8dfe0b18196748af174
@@ -57,14 +57,14 @@ module MiGA::Cli::Action::Download::Ncbi
57
57
  def remote_list
58
58
  if cli[:ncbi_taxonomy_dump]
59
59
  cli.say "Reading NCBI Taxonomy dump: #{cli[:ncbi_taxonomy_dump]}"
60
- MiGA::RemoteDataset.use_ncbi_taxonomy_dump(cli[:ncbi_taxonomy_dump])
60
+ MiGA::RemoteDataset.use_ncbi_taxonomy_dump(cli[:ncbi_taxonomy_dump], cli)
61
61
  end
62
62
 
63
63
  if cli[:ncbi_list_json] && File.size?(cli[:ncbi_list_json])
64
- cli.say "Reusing remote list: #{cli[:ncbi_list_json]}"
65
- return MiGA::Json.parse(cli[:ncbi_list_json])
64
+ return read_ncbi_list_json(cli[:ncbi_list_json])
66
65
  end
67
66
 
67
+ cli.say "Obtaining remote list of datasets"
68
68
  list = {}
69
69
  query = remote_list_query
70
70
  loop do
@@ -79,18 +79,45 @@ module MiGA::Cli::Action::Download::Ncbi
79
79
  list.merge!(parse_reports_as_datasets(page[:reports]))
80
80
 
81
81
  # Next page
82
+ cli.advance('Datasets:', list.size, page[:total_count])
82
83
  break unless page[:next_page_token]
83
84
  query[:page_token] = page[:next_page_token]
84
85
  end
86
+ cli.say
85
87
 
86
- if cli[:ncbi_list_json]
87
- cli.say "Saving remote list: #{cli[:ncbi_list_json]}"
88
- MiGA::Json.generate_fast(list, cli[:ncbi_list_json])
88
+ write_ncbi_list_json(cli[:ncbi_list_json], list) if cli[:ncbi_list_json]
89
+ list
90
+ end
91
+
92
+ def read_ncbi_list_json(file)
93
+ cli.say "Reusing remote list: #{file}"
94
+ list = {}
95
+ n_tot = nil
96
+ File.open(file, 'r') do |fh|
97
+ n_tot = fh.gets.chomp.sub(/^# /, '').to_i
98
+ fh.each_with_index do |ln, k|
99
+ row = ln.chomp.split("\t", 2)
100
+ list[row[0]] = MiGA::Json.parse(row[1], contents: true)
101
+ cli.advance('Lines:', k, n_tot)
102
+ end
103
+ cli.say
89
104
  end
105
+ return list
106
+ end
90
107
 
91
- list
108
+ def write_ncbi_list_json(file, list)
109
+ cli.say "Saving remote list: #{file}"
110
+ File.open(file, 'w') do |fh|
111
+ fh.puts('# %i' % list.size)
112
+ kk = 0
113
+ list.each do |k, v|
114
+ fh.puts([k, MiGA::Json.generate_fast(v)].join("\t"))
115
+ cli.advance('Datasets:', kk += 1, list.size)
116
+ end
117
+ cli.say
118
+ end
92
119
  end
93
-
120
+
94
121
  def parse_reports_as_datasets(reports)
95
122
  ds = {}
96
123
  reports.each do |r|
data/lib/miga/dataset.rb CHANGED
@@ -3,6 +3,7 @@
3
3
  # @package MiGA
4
4
  # @license Artistic-2.0
5
5
 
6
+ require'set'
6
7
  require 'miga/metadata'
7
8
  require 'miga/dataset/result'
8
9
  require 'miga/dataset/status'
@@ -27,7 +28,7 @@ class MiGA::Dataset < MiGA::MiGA
27
28
  ##
28
29
  # Does the +project+ already have a dataset with that +name+?
29
30
  def exist?(project, name)
30
- !project.dataset_names_hash[name].nil?
31
+ project.dataset_names_set.include? name
31
32
  end
32
33
 
33
34
  ##
data/lib/miga/json.rb CHANGED
@@ -15,6 +15,8 @@ class MiGA::Json < MiGA::MiGA
15
15
  # - +:symbolize+: If names should be symbolized. By default it's true if
16
16
  # additions is false, or false otherwise. They can both be false, but an
17
17
  # exception will be raised if both are true
18
+ # - +:large_file+: If passed, the file is treated as a file with very long
19
+ # lines (possibly a single long line)
18
20
  def default_opts(opts = {})
19
21
  opts[:contents] ||= false
20
22
  opts[:additions] ||= false
@@ -36,11 +38,18 @@ class MiGA::Json < MiGA::MiGA
36
38
 
37
39
  # Read JSON
38
40
  cont = path
39
- 12.times do
40
- cont = File.read(path)
41
- break unless cont.empty?
42
- sleep 1 # Wait up to 12 seconds for racing processes (iff empty file)
43
- end unless opts[:contents]
41
+ if opts[:large_file]
42
+ cont = ''
43
+ File.open(path, 'r') do |fh|
44
+ cont += fh.read(2 ** 16) until fh.eof?
45
+ end
46
+ elsif !opts[:contents]
47
+ 12.times do
48
+ cont = File.read(path)
49
+ break unless cont.empty?
50
+ sleep 1 # Wait up to 12 seconds for racing processes (iff empty file)
51
+ end
52
+ end
44
53
  raise "Empty descriptor: #{opts[:contents] ? "''" : path}" if cont.empty?
45
54
 
46
55
  # Parse JSON
data/lib/miga/metadata.rb CHANGED
@@ -116,7 +116,7 @@ class MiGA::Metadata < MiGA::MiGA
116
116
  ##
117
117
  # Return the value of +k+ in #data
118
118
  def [](k)
119
- if k.to_s =~ /(.+):(.+)/
119
+ if k.to_s =~ /^([^:]+):(.+)$/
120
120
  data[$1.to_sym]&.fetch($2)
121
121
  else
122
122
  data[k.to_sym]
@@ -5,24 +5,32 @@
5
5
  # Helper module including specific functions to handle datasets.
6
6
  module MiGA::Project::Dataset
7
7
  ##
8
- # Returns Array of MiGA::Dataset.
8
+ # Returns Array of MiGA::Dataset
9
9
  def datasets
10
10
  metadata[:datasets].map { |name| dataset(name) }
11
11
  end
12
12
 
13
13
  ##
14
- # Returns Array of String (without evaluating dataset objects).
14
+ # Returns Array of String (without evaluating dataset objects)
15
15
  def dataset_names
16
16
  metadata[:datasets]
17
17
  end
18
18
 
19
19
  ##
20
- # Returns Hash of Strings => true. Similar to +dataset_names+ but as
21
- # Hash for efficiency.
20
+ # Returns Hash of +{ String => true }+. Similar to +dataset_names+ but as
21
+ # Hash for efficiency
22
22
  def dataset_names_hash
23
+ warn 'The Project#dataset_names_hash method will be deprecated soon'
23
24
  @dataset_names_hash ||= Hash[dataset_names.map { |i| [i, true] }]
24
25
  end
25
26
 
27
+ ##
28
+ # Returns Set of Strings. Similar to +dataset_names+ but as Set for
29
+ # efficiency
30
+ def dataset_names_set
31
+ @dataset_names_set ||= Set.new(dataset_names)
32
+ end
33
+
26
34
  ##
27
35
  # Returns MiGA::Dataset
28
36
  def dataset(name)
@@ -50,7 +58,8 @@ module MiGA::Project::Dataset
50
58
  unless metadata[:datasets].include? name
51
59
  d = MiGA::Dataset.new(self, name)
52
60
  @metadata[:datasets] << name
53
- @dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
61
+ @dataset_names_hash[name] = true if @dataset_names_hash
62
+ @dataset_names_set << name if @dataset_names_set
54
63
  save
55
64
  if d.ref? && d.active?
56
65
  recalculate_tasks("Reference dataset added: #{d.name}")
data/lib/miga/project.rb CHANGED
@@ -77,6 +77,7 @@ class MiGA::Project < MiGA::MiGA
77
77
  def load
78
78
  @datasets = {}
79
79
  @dataset_names_hash = nil
80
+ @dataset_names_set = nil
80
81
  @metadata = MiGA::Metadata.load "#{path}/miga.project.json"
81
82
  raise "Couldn't find project metadata at #{path}" if metadata.nil?
82
83
 
@@ -16,7 +16,12 @@ class MiGA::RemoteDataset < MiGA::MiGA
16
16
  # Path to a directory with a recent NCBI Taxonomy dump to use instead of
17
17
  # making API calls to NCBI servers, which can be obtained at:
18
18
  # https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
19
- def use_ncbi_taxonomy_dump(path)
19
+ #
20
+ # The +cli+ parameter, if passed, should be a MiGA::Cli object that will
21
+ # be used to report advance in the reading. Other objects can be passed,
22
+ # minimally supporting the MiGA::Cli#say and MiGA::Cli#advance method
23
+ # interfaces
24
+ def use_ncbi_taxonomy_dump(path, cli = nil)
20
25
  raise "Directory doesn't exist: #{path}" unless File.directory?(path)
21
26
 
22
27
  # Structure: { TaxID => ["name", "rank", parent TaxID] }
@@ -24,23 +29,31 @@ class MiGA::RemoteDataset < MiGA::MiGA
24
29
  @ncbi_taxonomy_names = {}
25
30
 
26
31
  # Read names.dmp
27
- File.open(File.join(path, 'names.dmp')) do |fh|
32
+ File.open(file = File.join(path, 'names.dmp')) do |fh|
33
+ read = 0
34
+ size = File.size(file)
28
35
  fh.each do |ln|
36
+ cli&.advance('- names.dmp:', read += ln.size, size)
29
37
  row = ln.split(/\t\|\t?/)
30
38
  next unless row[3] == 'scientific name'
31
39
  @ncbi_taxonomy_names[row[0].to_i] = [row[1].strip]
32
40
  end
41
+ cli&.say
33
42
  end
34
43
 
35
44
  # Read nodes.dmp
36
- File.open(File.join(path, 'nodes.dmp')) do |fh|
45
+ File.open(file = File.join(path, 'nodes.dmp')) do |fh|
46
+ read = 0
47
+ size = File.size(file)
37
48
  fh.each do |ln|
49
+ cli&.advance('- nodes.dmp:', read += ln.size, size)
38
50
  row = ln.split(/\t\|\t?/)
39
51
  child = row[0].to_i
40
52
  parent = row[1].to_i
41
53
  @ncbi_taxonomy_names[child][1] = row[2]
42
54
  @ncbi_taxonomy_names[child][2] = parent unless parent == child
43
55
  end
56
+ cli&.say
44
57
  end
45
58
  end
46
59
 
data/lib/miga/taxonomy.rb CHANGED
@@ -188,7 +188,7 @@ class MiGA::Taxonomy < MiGA::MiGA
188
188
  when Array, Hash
189
189
  self << str
190
190
  else
191
- "#{str} ".scan(/([A-Za-z]+):([^:]*)( )/) { |r, n, _| self << { r => n } }
191
+ " #{str} ".scan(/(?<= )([A-Za-z]+):([^:]*) /) { |r, n| self << { r => n } }
192
192
  end
193
193
  end
194
194
 
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 10, 1].freeze
15
+ VERSION = [1.3, 10, 2].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2024, 1, 31)
23
+ VERSION_DATE = Date.new(2024, 2, 6)
24
24
 
25
25
  ##
26
26
  # References of MiGA
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.10.1
4
+ version: 1.3.10.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-01-31 00:00:00.000000000 Z
11
+ date: 2024-02-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons