miga-base 1.3.10.0 → 1.3.10.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0239e39a0588b73d042da7d970925d2d93a5334c858453e032d51b0af760fa27'
4
- data.tar.gz: 81e6903e1feba6571d76fe5d113a60414bd0d3b1b3090d6e26367a93cf8d0da7
3
+ metadata.gz: 601e480270ea7f04ff3f5057fa5f031146ac2e73247a7c191ea391a17e971bfc
4
+ data.tar.gz: 1577f19ec6b7798305a847da42556a16ff9e6a144c2e317e3349a8b49a0069fd
5
5
  SHA512:
6
- metadata.gz: d68e55d5335f3da03eb9cea737aad5fa21a7a272e3958db6130e7260387844c1bed92b0b2f655a5a5133772797212b717559da9701723c9885cc9ee7cffc962f
7
- data.tar.gz: 8d27c2f580106c0d1f74e6daaf1cb81ffd7c6fbabf86e7569342027ff9e09b1810c245ad560eff0179c3d06a88b2d1a087344c56e34a2d54c799004e2f6370c0
6
+ metadata.gz: '0293ffeea41a27ddef7a7b077d8cd93f86f5b9e95537576f8ce87120242cf3ced5f9554966cf648dcb817134977fbe6ee13a45b0815069cf2471014d780ce8c4'
7
+ data.tar.gz: b7ab71aa3a78d32861e1e9f6ff685424530093fd485185be722c6cc139a81d30692e00ccb6cbad4799bbcbd077ce32c7693db8336c98e8dfe0b18196748af174
@@ -29,7 +29,7 @@ module MiGA::Cli::Action::Download::Ncbi
29
29
  opt.on(
30
30
  '--ncbi-taxonomy-dump STRING',
31
31
  'Path to an NCBI Taxonomy dump directory to query instead of API calls'
32
- ) { |v| MiGA::RemoteDataset.use_ncbi_taxonomy_dump(v) }
32
+ ) { |v| cli[:ncbi_taxonomy_dump] = v }
33
33
  end
34
34
 
35
35
  def cli_name_modifiers(opt)
@@ -55,11 +55,16 @@ module MiGA::Cli::Action::Download::Ncbi
55
55
  end
56
56
 
57
57
  def remote_list
58
+ if cli[:ncbi_taxonomy_dump]
59
+ cli.say "Reading NCBI Taxonomy dump: #{cli[:ncbi_taxonomy_dump]}"
60
+ MiGA::RemoteDataset.use_ncbi_taxonomy_dump(cli[:ncbi_taxonomy_dump], cli)
61
+ end
62
+
58
63
  if cli[:ncbi_list_json] && File.size?(cli[:ncbi_list_json])
59
- cli.say "Reusing remote list: #{cli[:ncbi_list_json]}"
60
- return MiGA::Json.parse(cli[:ncbi_list_json])
64
+ return read_ncbi_list_json(cli[:ncbi_list_json])
61
65
  end
62
66
 
67
+ cli.say "Obtaining remote list of datasets"
63
68
  list = {}
64
69
  query = remote_list_query
65
70
  loop do
@@ -74,18 +79,45 @@ module MiGA::Cli::Action::Download::Ncbi
74
79
  list.merge!(parse_reports_as_datasets(page[:reports]))
75
80
 
76
81
  # Next page
82
+ cli.advance('Datasets:', list.size, page[:total_count])
77
83
  break unless page[:next_page_token]
78
84
  query[:page_token] = page[:next_page_token]
79
85
  end
86
+ cli.say
87
+
88
+ write_ncbi_list_json(cli[:ncbi_list_json], list) if cli[:ncbi_list_json]
89
+ list
90
+ end
80
91
 
81
- if cli[:ncbi_list_json]
82
- cli.say "Saving remote list: #{cli[:ncbi_list_json]}"
83
- MiGA::Json.generate_fast(list, cli[:ncbi_list_json])
92
+ def read_ncbi_list_json(file)
93
+ cli.say "Reusing remote list: #{file}"
94
+ list = {}
95
+ n_tot = nil
96
+ File.open(file, 'r') do |fh|
97
+ n_tot = fh.gets.chomp.sub(/^# /, '').to_i
98
+ fh.each_with_index do |ln, k|
99
+ row = ln.chomp.split("\t", 2)
100
+ list[row[0]] = MiGA::Json.parse(row[1], contents: true)
101
+ cli.advance('Lines:', k, n_tot)
102
+ end
103
+ cli.say
84
104
  end
105
+ return list
106
+ end
85
107
 
86
- list
108
+ def write_ncbi_list_json(file, list)
109
+ cli.say "Saving remote list: #{file}"
110
+ File.open(file, 'w') do |fh|
111
+ fh.puts('# %i' % list.size)
112
+ kk = 0
113
+ list.each do |k, v|
114
+ fh.puts([k, MiGA::Json.generate_fast(v)].join("\t"))
115
+ cli.advance('Datasets:', kk += 1, list.size)
116
+ end
117
+ cli.say
118
+ end
87
119
  end
88
-
120
+
89
121
  def parse_reports_as_datasets(reports)
90
122
  ds = {}
91
123
  reports.each do |r|
@@ -20,7 +20,10 @@ module MiGA::Dataset::Status
20
20
  old_status = metadata[:status]
21
21
  metadata[:status] =
22
22
  !active? ? 'inactive' : done_preprocessing? ? 'complete' : 'incomplete'
23
- self.save if save && (old_status.nil? || old_status != metadata[:status])
23
+ if save && (old_status.nil? || old_status != metadata[:status])
24
+ self.save
25
+ MiGA::MiGA.DEBUG "Status changed: #{old_status} -> #{metadata[:status]}"
26
+ end
24
27
  metadata[:status].to_sym
25
28
  end
26
29
  end
data/lib/miga/dataset.rb CHANGED
@@ -3,6 +3,7 @@
3
3
  # @package MiGA
4
4
  # @license Artistic-2.0
5
5
 
6
+ require'set'
6
7
  require 'miga/metadata'
7
8
  require 'miga/dataset/result'
8
9
  require 'miga/dataset/status'
@@ -27,7 +28,7 @@ class MiGA::Dataset < MiGA::MiGA
27
28
  ##
28
29
  # Does the +project+ already have a dataset with that +name+?
29
30
  def exist?(project, name)
30
- !project.dataset_names_hash[name].nil?
31
+ project.dataset_names_set.include? name
31
32
  end
32
33
 
33
34
  ##
@@ -61,6 +62,7 @@ class MiGA::Dataset < MiGA::MiGA
61
62
  @project, @name, @metadata = project, name, nil
62
63
  metadata[:ref] = is_ref
63
64
  metadata[:type] ||= :empty
65
+ metadata[:status] ||= 'incomplete'
64
66
  @metadata_future = [
65
67
  File.join(project.path, 'metadata', "#{name}.json"),
66
68
  metadata
@@ -84,15 +86,18 @@ class MiGA::Dataset < MiGA::MiGA
84
86
  ##
85
87
  # Save any changes you've made in the dataset
86
88
  def save
87
- MiGA.DEBUG "Dataset.metadata: #{metadata.data}"
89
+ MiGA.DEBUG "Dataset.save: #{name}"
88
90
  metadata.save
89
91
  pull_hook :on_save
90
92
  end
91
93
 
92
94
  ##
93
- # Currently +save!+ is simply an alias of +save+, for compatibility with the
94
- # +Project+ interface
95
- alias :save! :save
95
+ # Forces a save even if nothing has changed in the metadata
96
+ def save!
97
+ MiGA.DEBUG "Dataset.save!: #{name}"
98
+ metadata.save!
99
+ pull_hook :on_save
100
+ end
96
101
 
97
102
  ##
98
103
  # Delete the dataset with all its contents (including results) and returns
@@ -148,7 +153,7 @@ class MiGA::Dataset < MiGA::MiGA
148
153
  ##
149
154
  # Is this dataset active?
150
155
  def active?
151
- metadata[:inactive].nil? or !metadata[:inactive]
156
+ metadata[:inactive].nil? || !metadata[:inactive]
152
157
  end
153
158
 
154
159
  ##
data/lib/miga/json.rb CHANGED
@@ -15,6 +15,8 @@ class MiGA::Json < MiGA::MiGA
15
15
  # - +:symbolize+: If names should be symbolized. By default it's true if
16
16
  # additions is false, or false otherwise. They can both be false, but an
17
17
  # exception will be raised if both are true
18
+ # - +:large_file+: If passed, the file is treated as a file with very long
19
+ # lines (possibly a single long line)
18
20
  def default_opts(opts = {})
19
21
  opts[:contents] ||= false
20
22
  opts[:additions] ||= false
@@ -36,11 +38,18 @@ class MiGA::Json < MiGA::MiGA
36
38
 
37
39
  # Read JSON
38
40
  cont = path
39
- 12.times do
40
- cont = File.read(path)
41
- break unless cont.empty?
42
- sleep 1 # Wait up to 12 seconds for racing processes (iff empty file)
43
- end unless opts[:contents]
41
+ if opts[:large_file]
42
+ cont = ''
43
+ File.open(path, 'r') do |fh|
44
+ cont += fh.read(2 ** 16) until fh.eof?
45
+ end
46
+ elsif !opts[:contents]
47
+ 12.times do
48
+ cont = File.read(path)
49
+ break unless cont.empty?
50
+ sleep 1 # Wait up to 12 seconds for racing processes (iff empty file)
51
+ end
52
+ end
44
53
  raise "Empty descriptor: #{opts[:contents] ? "''" : path}" if cont.empty?
45
54
 
46
55
  # Parse JSON
data/lib/miga/metadata.rb CHANGED
@@ -26,12 +26,17 @@ class MiGA::Metadata < MiGA::MiGA
26
26
  # Path to the JSON file describing the metadata
27
27
  attr_reader :path
28
28
 
29
+ ##
30
+ # Hash (Integer) of the last saved data Hash (object)
31
+ attr_reader :saved_hash
32
+
29
33
  ##
30
34
  # Initiate a MiGA::Metadata object with description in +path+.
31
35
  # It will create it if it doesn't exist.
32
36
  def initialize(path, defaults = {})
33
37
  @data = nil
34
38
  @path = File.absolute_path(path)
39
+ @saved_hash = nil
35
40
  unless File.exist? path
36
41
  @data = {}
37
42
  defaults.each { |k, v| self[k] = v }
@@ -57,35 +62,41 @@ class MiGA::Metadata < MiGA::MiGA
57
62
  # Save the metadata into #path
58
63
  def save
59
64
  return if self[:never_save]
65
+ return if !saved_hash.nil? && saved_hash == data.hash
60
66
 
61
67
  MiGA::MiGA.DEBUG "Metadata.save #{path}"
68
+ path_tmp = "#{path}.tmp"
62
69
  self[:updated] = Time.now.to_s
70
+ @saved_hash = data.hash
63
71
  json = to_json
64
72
  wait_for_lock
65
73
  FileUtils.touch(lock_file)
66
- ofh = File.open("#{path}.tmp", 'w')
67
- ofh.puts json
68
- ofh.close
74
+ File.open(path_tmp, 'w') { |ofh| ofh.puts json }
69
75
 
70
- unless File.exist?("#{path}.tmp") && File.exist?(lock_file)
76
+ unless File.exist?(path_tmp) && File.exist?(lock_file)
71
77
  raise "Lock-racing detected for #{path}"
72
78
  end
73
79
 
74
- File.rename("#{path}.tmp", path)
80
+ File.rename(path_tmp, path)
75
81
  File.unlink(lock_file)
76
82
  end
77
83
 
84
+ ##
85
+ # Force +save+ even if nothing has changed since the last save
86
+ # or load. However, it doesn't save if +:never_save+ is true.
87
+ def save!
88
+ @saved_hash = nil
89
+ save
90
+ end
91
+
78
92
  ##
79
93
  # (Re-)load metadata stored in #path
80
94
  def load
81
- sleeper = 0.0
82
- while File.exist? lock_file
83
- sleeper += 0.1 if sleeper <= 10.0
84
- sleep(sleeper.to_i)
85
- end
95
+ wait_for_lock
86
96
  tmp = MiGA::Json.parse(path, additions: true)
87
97
  @data = {}
88
98
  tmp.each { |k, v| self[k] = v }
99
+ @saved_hash = data.hash
89
100
  end
90
101
 
91
102
  ##
@@ -105,7 +116,7 @@ class MiGA::Metadata < MiGA::MiGA
105
116
  ##
106
117
  # Return the value of +k+ in #data
107
118
  def [](k)
108
- if k.to_s =~ /(.+):(.+)/
119
+ if k.to_s =~ /^([^:]+):(.+)$/
109
120
  data[$1.to_sym]&.fetch($2)
110
121
  else
111
122
  data[k.to_sym]
@@ -5,24 +5,32 @@
5
5
  # Helper module including specific functions handle datasets.
6
6
  module MiGA::Project::Dataset
7
7
  ##
8
- # Returns Array of MiGA::Dataset.
8
+ # Returns Array of MiGA::Dataset
9
9
  def datasets
10
10
  metadata[:datasets].map { |name| dataset(name) }
11
11
  end
12
12
 
13
13
  ##
14
- # Returns Array of String (without evaluating dataset objects).
14
+ # Returns Array of String (without evaluating dataset objects)
15
15
  def dataset_names
16
16
  metadata[:datasets]
17
17
  end
18
18
 
19
19
  ##
20
- # Returns Hash of Strings => true. Similar to +dataset_names+ but as
21
- # Hash for efficiency.
20
+ # Returns Hash of +{ String => true }+. Similar to +dataset_names+ but as
21
+ # Hash for efficiency
22
22
  def dataset_names_hash
23
+ warn 'The Project#dataset_names_hash method will be deprecated soon'
23
24
  @dataset_names_hash ||= Hash[dataset_names.map { |i| [i, true] }]
24
25
  end
25
26
 
27
+ ##
28
+ # Returns Set of Strings. Similar to +dataset_names+ but as Set for
29
+ # efficiency
30
+ def dataset_names_set
31
+ @dataset_names_set ||= Set.new(dataset_names)
32
+ end
33
+
26
34
  ##
27
35
  # Returns MiGA::Dataset
28
36
  def dataset(name)
@@ -50,7 +58,8 @@ module MiGA::Project::Dataset
50
58
  unless metadata[:datasets].include? name
51
59
  d = MiGA::Dataset.new(self, name)
52
60
  @metadata[:datasets] << name
53
- @dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
61
+ @dataset_names_hash[name] = true if @dataset_names_hash
62
+ @dataset_names_set << name if @dataset_names_set
54
63
  save
55
64
  if d.ref? && d.active?
56
65
  recalculate_tasks("Reference dataset added: #{d.name}")
data/lib/miga/project.rb CHANGED
@@ -67,7 +67,7 @@ class MiGA::Project < MiGA::MiGA
67
67
  ##
68
68
  # Save any changes persistently, regardless of +do_not_save+
69
69
  def save!
70
- metadata.save
70
+ metadata.save!
71
71
  pull_hook :on_save
72
72
  self.load
73
73
  end
@@ -77,6 +77,7 @@ class MiGA::Project < MiGA::MiGA
77
77
  def load
78
78
  @datasets = {}
79
79
  @dataset_names_hash = nil
80
+ @dataset_names_set = nil
80
81
  @metadata = MiGA::Metadata.load "#{path}/miga.project.json"
81
82
  raise "Couldn't find project metadata at #{path}" if metadata.nil?
82
83
 
@@ -16,30 +16,44 @@ class MiGA::RemoteDataset < MiGA::MiGA
16
16
  # Path to a directory with a recent NCBI Taxonomy dump to use instead of
17
17
  # making API calls to NCBI servers, which can be obtained at:
18
18
  # https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
19
- def use_ncbi_taxonomy_dump(path)
19
+ #
20
+ # The +cli+ parameter, if passed, should be a MiGA::Cli object that will
21
+ # be used to report advance in the reading. Other objects can be passed,
22
+ # minimally supporting the MiGA::Cli#say and MiGA::Cli#advance method
23
+ # interfaces
24
+ def use_ncbi_taxonomy_dump(path, cli = nil)
20
25
  raise "Directory doesn't exist: #{path}" unless File.directory?(path)
21
26
 
22
27
  # Structure: { TaxID => ["name", "rank", parent TaxID] }
28
+ MiGA::MiGA.DEBUG "Loading NCBI Taxonomy dump: #{path}"
23
29
  @ncbi_taxonomy_names = {}
24
30
 
25
31
  # Read names.dmp
26
- File.open(File.join(path, 'names.dmp')) do |fh|
32
+ File.open(file = File.join(path, 'names.dmp')) do |fh|
33
+ read = 0
34
+ size = File.size(file)
27
35
  fh.each do |ln|
36
+ cli&.advance('- names.dmp:', read += ln.size, size)
28
37
  row = ln.split(/\t\|\t?/)
29
38
  next unless row[3] == 'scientific name'
30
39
  @ncbi_taxonomy_names[row[0].to_i] = [row[1].strip]
31
40
  end
41
+ cli&.say
32
42
  end
33
43
 
34
44
  # Read nodes.dmp
35
- File.open(File.join(path, 'nodes.dmp')) do |fh|
45
+ File.open(file = File.join(path, 'nodes.dmp')) do |fh|
46
+ read = 0
47
+ size = File.size(file)
36
48
  fh.each do |ln|
49
+ cli&.advance('- nodes.dmp:', read += ln.size, size)
37
50
  row = ln.split(/\t\|\t?/)
38
51
  child = row[0].to_i
39
52
  parent = row[1].to_i
40
53
  @ncbi_taxonomy_names[child][1] = row[2]
41
54
  @ncbi_taxonomy_names[child][2] = parent unless parent == child
42
55
  end
56
+ cli&.say
43
57
  end
44
58
  end
45
59
 
data/lib/miga/taxonomy.rb CHANGED
@@ -188,7 +188,7 @@ class MiGA::Taxonomy < MiGA::MiGA
188
188
  when Array, Hash
189
189
  self << str
190
190
  else
191
- "#{str} ".scan(/([A-Za-z]+):([^:]*)( )/) { |r, n, _| self << { r => n } }
191
+ " #{str} ".scan(/(?<= )([A-Za-z]+):([^:]*) /) { |r, n| self << { r => n } }
192
192
  end
193
193
  end
194
194
 
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 10, 0].freeze
15
+ VERSION = [1.3, 10, 2].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2024, 1, 31)
23
+ VERSION_DATE = Date.new(2024, 2, 6)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -13,7 +13,7 @@ class MetadataTest < Test::Unit::TestCase
13
13
  File.unlink(md1.lock_file)
14
14
  end
15
15
  t1 = Time.new
16
- md1.save
16
+ md1.save!
17
17
  t2 = Time.new
18
18
  assert_path_not_exist(md1.lock_file)
19
19
  assert_ge(t2 - t1, 1.0)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.10.0
4
+ version: 1.3.10.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-01-31 00:00:00.000000000 Z
11
+ date: 2024-02-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons