miga-base 1.3.20.5 → 1.3.20.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 17c8c736e04eeaea0e7247e7a5134a0902f4e2ed4c749d68e00b00aeeb7e7295
4
- data.tar.gz: 93cca0af126920c07740be2a4f203d36f3632722022c5e09eda5d326469c31bc
3
+ metadata.gz: 2c6360afd316cce82e4e86fbf6add9fcf77e8d42b8f3310639be4d96f8b9c195
4
+ data.tar.gz: dba1b714a536d2a99fe6101ba2406eab9e8d91d32a813f6a00670195ac0bd841
5
5
  SHA512:
6
- metadata.gz: af8314535751804f3870767b4c0d88c5c139a3ae94e03c92317935397c5ee20a88e49ad97addcbf4be6e47947ee659a919db4ab1d60b48f86c5eda627fb3dba4
7
- data.tar.gz: 65e3cdc46e3cf3d4f094b0dd6bfa93806ede628573eede6f893450720d76a8c5d69528c4b63cb6839861438014d6801297d0c8a70c0d4f841a79118d54aab2cf
6
+ metadata.gz: 78f8a407b258726b64d4fcc1c124a614ae7a0b0f9401669b3fbeae6912045ba0c7dac5af73d20359bb0c073bba20c87d4a3efcbb0c81b06cedc1c65ce2e0feba
7
+ data.tar.gz: eeb2acbcb111419f6c7427a08a29095f075ba6b548a9a56ec8689d8ad72a188db6a3277c861c1baaebc5a9492f64d459f1f1c8be62d5231feaf7bb5d2544b2ec
@@ -39,17 +39,20 @@ module MiGA::Cli::Action::Doctor::Distances
39
39
  project = cli.load_project
40
40
  ref_ds = project.each_dataset.select(&:ref?)
41
41
 
42
- # Read and merge data
42
+ # Read and write data
43
43
  tmp = partial_bidir_tmp(project, ref_ds)
44
- dist = merge_bidir_tmp(tmp)
44
+ fixed_ds = merge_bidir_tmp(tmp)
45
45
  FileUtils.rm_rf(tmp)
46
46
 
47
- # Write missing values (threaded)
48
- MiGA::Parallel.distribute(ref_ds, cli[:threads]) do |ds, idx, thr|
49
- cli.advance('Datasets:', idx + 1, ref_ds.size, false) if thr == 0
50
- save_bidirectional(ds, dist)
47
+ # Fix tables if needed
48
+ unless fixed_ds.empty?
49
+ cli.say ' - Filled datasets: %i' % fixed_ds.size
50
+ %i[aai_distances ani_distances].each do |res_name|
51
+ res = cli.load_project.result(res_name) or next
52
+ cli.say ' - Recalculating tables: %s' % res_name
53
+ res.recalculate!('Distances updated for bidirectionality').save
54
+ end
51
55
  end
52
- cli.say
53
56
  end
54
57
 
55
58
  ##
@@ -69,94 +72,133 @@ module MiGA::Cli::Action::Doctor::Distances
69
72
 
70
73
  #---- Auxiliary functions -----
71
74
 
75
+ ##
76
+ # Calculates the number of chunks that should be produced during the
77
+ # bidirectional checks for +n+ reference datasets (Integer)
78
+ def partial_bidir_chunks(n)
79
+ y = [cli[:threads], (n / 1024).ceil].max
80
+ y = n if y > n
81
+ y
82
+ end
83
+
72
84
  ##
73
85
  # Make a temporal directory holding partial bidirectionality reports (one per
74
86
  # thread) in a custom multi-JSON format. Requires a MiGA::Project +project+
75
87
  # and the iterator of the reference datasets +ref_ds+. Returns the path to the
76
- # temporal directory created. Used by +check_bidir+
88
+ # temporal directory created
77
89
  def partial_bidir_tmp(project, ref_ds)
78
90
  n = ref_ds.size
91
+ chunks = partial_bidir_chunks(n)
79
92
 
80
93
  # Check first if a previous run is complete (and recover it)
81
94
  tmp = File.join(project.path, 'doctor-bidirectional.tmp')
82
- tmp_done = File.join(tmp, 'done.txt')
95
+ tmp_done = File.join(tmp, 'read-done.txt')
83
96
  if File.size?(tmp_done) &&
84
- File.readlines(tmp_done)[0].chomp.to_i == cli[:threads]
97
+ File.readlines(tmp_done)[0].chomp.to_i == chunks
85
98
  return tmp
86
99
  end
87
100
 
88
101
  # Read data first (threaded)
89
102
  FileUtils.mkdir_p(tmp)
90
- MiGA::Parallel.process(cli[:threads]) do |thr|
91
- file = File.join(tmp, "#{thr}.json")
92
- fh = File.open(file, 'w')
103
+ chunks_e = 0 .. chunks - 1
104
+ MiGA::Parallel.distribute(chunks_e, cli[:threads]) do |chunk, k, thr|
105
+ cli.advance('Reading:', k, chunks, false) if thr == 0
106
+ dist = {}
93
107
  [:aai, :ani].each do |metric|
94
- fh.puts "# #{metric}"
108
+ dist[metric] = {}
95
109
  ref_ds.each_with_index do |ds, idx|
96
- if idx % cli[:threads] == thr
97
- cli.advance('Reading:', idx + 1, n, false) if thr == 0
110
+ if idx % chunks == chunk
98
111
  row = read_bidirectional(ds, metric)
99
- fh.puts "#{ds.name} #{JSON.fast_generate(row)}" unless row.empty?
112
+ dist[metric][ds.name] = row unless row.empty?
100
113
  end
101
114
  end
102
115
  end
103
- fh.puts '# end'
104
- fh.flush # necessary for large threaded runs
105
- fh.close
106
- if thr == 0
107
- cli.advance('Reading:', n, n, false)
108
- cli.say
109
- end
116
+ file = File.join(tmp, "#{chunk}.marshal")
117
+ File.open("#{file}.tmp", 'w') { |fh| Marshal.dump(dist, fh) }
118
+ File.rename("#{file}.tmp", file)
110
119
  end
120
+ cli.advance('Reading:', chunks, chunks, false)
121
+ cli.say
111
122
 
112
123
  # Save information to indicate that the run is complete and return
113
- File.open(tmp_done, 'w') { |fh| fh.puts cli[:threads] }
124
+ File.open(tmp_done, 'w') { |fh| fh.puts chunks }
114
125
  return tmp
115
126
  end
116
127
 
117
128
  ##
118
129
  # Read partial temporal reports of bidirectionality (located in +tmp+), and
119
- # return a two-deep hash with the final missingness report by metric (first
120
- # key) and dataset name (second key). Used by +check_bidir+
130
+ # fill databases with missing values. Returns the names of the datasets fixed
131
+ # as a Set.
121
132
  def merge_bidir_tmp(tmp)
122
- dist = { aai: {}, ani: {} }
123
- cli[:threads].times do |i|
124
- cli.advance('Merging:', i + 1, cli[:threads], false)
125
-
126
- next if File.size?(File.join(tmp, "#{i+1}.json.marshal"))
127
- file = File.join(tmp, "#{i}.json")
128
- if File.size?("#{file}.marshal")
129
- dist = Marshal.load(File.read("#{file}.marshal"))
130
- next
133
+ tmp_done = File.join(tmp, 'read-done.txt')
134
+ chunks = File.readlines(tmp_done)[0].chomp.to_i
135
+
136
+ lower_triangle = []
137
+ chunks.times.each do |i|
138
+ (0 .. i).to_a.each { |j| lower_triangle << [i, j] }
139
+ end
140
+ MiGA::Parallel.distribute(lower_triangle, cli[:threads]) do |cell, k, thr|
141
+ cli.advance('Writing:', k, lower_triangle.size, false) if thr == 0
142
+ fixed_ds = merge_bidir_tmp_pair(tmp, cell[0], cell[1])
143
+ File.open(File.join(tmp, "#{cell[0]}-#{cell[1]}.txt"), 'w') do |fh|
144
+ fixed_ds.each { |ds| fh.puts ds }
145
+ end
146
+ end
147
+ cli.advance('Writing:', lower_triangle.size, lower_triangle.size, false)
148
+ cli.say
149
+ lower_triangle.map do |cell|
150
+ Set.new.tap do |y|
151
+ File.open(File.join(tmp, "#{cell[0]}-#{cell[1]}.txt"), 'r') do |fh|
152
+ fh.each { |ln| y << ln.chomp }
153
+ end
131
154
  end
155
+ end.inject(Set.new, :+)
156
+ end
132
157
 
133
- File.open(file, 'r') do |fh|
134
- metric = nil
135
- fh.each do |ln|
136
- qry, row = ln.chomp.split(' ', 2)
137
- row or raise "Unexpected format in #{file}:#{$.}"
138
- if qry == '#'
139
- metric = row.to_sym
140
- else
141
- raise "Unrecognized metric: #{metric}" unless dist[metric]
142
- JSON.parse(row).each do |sbj, val|
143
- dist[metric][qry] ||= {}
144
- if dist[metric][sbj]&.include?(qry)
145
- dist[metric][sbj].delete(qry) # Already bidirectional
146
- else
147
- dist[metric][qry][sbj] = val
148
- end
149
- end
150
- end
158
+ ##
159
+ # Cross-reference two reports of bidirectionality (located in +tmp+),
160
+ # identified by indexes +x+ and +y+, and fill databases with missing values.
161
+ # Returns the names of the fixed datasets as a Set.
162
+ def merge_bidir_tmp_pair(tmp, x, y)
163
+ dist_x = Marshal.load(File.read(File.join(tmp, "#{x}.marshal")))
164
+ if x == y
165
+ merge_bidir_tmp_cell(dist_x, dist_x)
166
+ else
167
+ dist_y = Marshal.load(File.read(File.join(tmp, "#{y}.marshal")))
168
+ merge_bidir_tmp_cell(dist_x, dist_y) +
169
+ merge_bidir_tmp_cell(dist_y, dist_x)
170
+ end
171
+ end
172
+
173
+ ##
174
+ # Find missing values in a "chunks cell" and fill databases. Returns the names
175
+ # of the fixed datasets as a Set.
176
+ def merge_bidir_tmp_cell(dist_x, dist_y)
177
+ # Find missing values
178
+ dist = {}
179
+ datasets = Set.new
180
+ dist_x.each do |metric, distances_x|
181
+ dist[metric] = {}
182
+ distances_x.each do |qry_x, row_x|
183
+ dist_y[metric].each do |qry_y, row_y|
184
+ # Ignore if missing in dist_x
185
+ next unless dist_x[metric][qry_x]&.include?(qry_y)
186
+ # Ignore if already in dist_y
187
+ next if dist_y[metric][qry_y]&.include?(qry_x)
188
+ # Save otherwise
189
+ dist[metric][qry_x] ||= {}
190
+ dist[metric][qry_x][qry_y] = dist_x[metric][qry_x][qry_y]
191
+ datasets << qry_y
151
192
  end
152
- raise "Incomplete thread dump: #{file}" unless metric == :end
153
193
  end
154
- File.open("#{file}.marshal.tmp", 'w') { |fh| Marshal.dump(dist, fh) }
155
- File.rename("#{file}.marshal.tmp", "#{file}.marshal")
156
194
  end
157
- cli.say
158
195
 
159
- return dist
196
+ # Save them in databases
197
+ datasets.each do |ds_name|
198
+ ds = cli.load_project.dataset(ds_name)
199
+ save_bidirectional(ds, dist)
200
+ end
201
+ datasets
160
202
  end
161
203
  end
162
204
 
data/lib/miga/json.rb CHANGED
@@ -9,7 +9,7 @@ rescue LoadError
9
9
  end
10
10
 
11
11
  ##
12
- # Taxonomic classifications in MiGA.
12
+ # JSON I/O utils in MiGA.
13
13
  class MiGA::Json < MiGA::MiGA
14
14
  class << self
15
15
  ##
data/lib/miga/parallel.rb CHANGED
@@ -21,6 +21,7 @@ class MiGA::Parallel < MiGA::MiGA
21
21
  # 3. Index of the acting thread
22
22
  def distribute(enum, threads, &blk)
23
23
  process(threads) { |thr| thread_enum(enum, threads, thr, &blk) }
24
+ Process.waitall # <- Just to double-check, but `process` should suffice
24
25
  end
25
26
 
26
27
  ##
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 20, 5].freeze
15
+ VERSION = [1.3, 20, 7].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2024, 9, 1)
23
+ VERSION_DATE = Date.new(2024, 9, 12)
24
24
 
25
25
  ##
26
26
  # References of MiGA
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.20.5
4
+ version: 1.3.20.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-09-01 00:00:00.000000000 Z
11
+ date: 2024-09-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons