miga-base 1.2.18.2 → 1.3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor/base.rb +2 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/dataset/result/add.rb +3 -2
  5. data/lib/miga/version.rb +2 -2
  6. data/scripts/essential_genes.bash +4 -8
  7. data/utils/FastAAI/LICENSE +8 -0
  8. data/utils/FastAAI/README.md +151 -40
  9. data/utils/FastAAI/__init__.py +1 -0
  10. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
  11. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
  12. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
  13. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
  14. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
  15. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
  16. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
  17. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
  18. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
  19. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
  20. data/utils/FastAAI/fastaai/__init__.py +1 -0
  21. data/utils/FastAAI/fastaai/fastaai +4805 -0
  22. data/utils/FastAAI/fastaai/fastaai.py +4805 -0
  23. data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
  24. data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
  25. data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
  26. data/utils/distance/commands.rb +51 -23
  27. metadata +23 -6
  28. data/utils/FastAAI/FastAAI +0 -3659
  29. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
  30. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
  31. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,122 @@
1
+ PF01780_19 0
2
+ PF03948_14 1
3
+ PF17144_4 2
4
+ PF00830_19 3
5
+ PF00347_23 4
6
+ PF16906_5 5
7
+ PF13393_6 6
8
+ PF02565_15 7
9
+ PF01991_18 8
10
+ PF01984_20 9
11
+ PF00861_22 10
12
+ PF13656_6 11
13
+ PF00368_18 12
14
+ PF01142_18 13
15
+ PF00312_22 14
16
+ PF02367_17 15
17
+ PF01951_16 16
18
+ PF00749_21 17
19
+ PF01655_18 18
20
+ PF00318_20 19
21
+ PF01813_17 20
22
+ PF01649_18 21
23
+ PF01025_19 22
24
+ PF00380_19 23
25
+ PF01282_19 24
26
+ PF01864_17 25
27
+ PF01783_23 26
28
+ PF01808_18 27
29
+ PF01982_16 28
30
+ PF01715_17 29
31
+ PF00213_18 30
32
+ PF00119_20 31
33
+ PF00573_22 32
34
+ PF01981_16 33
35
+ PF00281_19 34
36
+ PF00584_20 35
37
+ PF00825_18 36
38
+ PF00406_22 37
39
+ PF00177_21 38
40
+ PF01192_22 39
41
+ PF05833_11 40
42
+ PF02699_15 41
43
+ PF01016_19 42
44
+ PF01765_19 43
45
+ PF00453_18 44
46
+ PF01193_24 45
47
+ PF05221_17 46
48
+ PF00231_19 47
49
+ PF00416_22 48
50
+ PF02033_18 49
51
+ PF01668_18 50
52
+ PF00886_19 51
53
+ PF00252_18 52
54
+ PF00572_18 53
55
+ PF00366_20 54
56
+ PF04104_14 55
57
+ PF04919_12 56
58
+ PF01912_18 57
59
+ PF00276_20 58
60
+ PF00203_21 59
61
+ PF00889_19 60
62
+ PF02996_17 61
63
+ PF00121_18 62
64
+ PF01990_17 63
65
+ PF00344_20 64
66
+ PF00297_22 65
67
+ PF01196_19 66
68
+ PF01194_17 67
69
+ PF01725_16 68
70
+ PF00750_19 69
71
+ PF00338_22 70
72
+ PF00238_19 71
73
+ PF01200_18 72
74
+ PF00162_19 73
75
+ PF00181_23 74
76
+ PF01866_17 75
77
+ PF00709_21 76
78
+ PF02006_16 77
79
+ PF00164_25 78
80
+ PF00237_19 79
81
+ PF01139_17 80
82
+ PF01351_18 81
83
+ PF04010_13 82
84
+ PF06093_13 83
85
+ PF00828_19 84
86
+ PF02410_15 85
87
+ PF01176_19 86
88
+ PF02130_17 87
89
+ PF01948_18 88
90
+ PF01195_19 89
91
+ PF01746_21 90
92
+ PF01667_17 91
93
+ PF03874_16 92
94
+ PF01090_19 93
95
+ PF01198_19 94
96
+ PF01250_17 95
97
+ PF17136_4 96
98
+ PF06026_14 97
99
+ PF03652_15 98
100
+ PF04019_12 99
101
+ PF01201_22 100
102
+ PF00832_20 101
103
+ PF01264_21 102
104
+ PF03840_14 103
105
+ PF00831_23 104
106
+ PF00189_20 105
107
+ PF02601_15 106
108
+ PF01496_19 107
109
+ PF00411_19 108
110
+ PF00334_19 109
111
+ PF00687_21 110
112
+ PF01157_18 111
113
+ PF01245_20 112
114
+ PF01994_16 113
115
+ PF01632_19 114
116
+ PF00827_17 115
117
+ PF01015_18 116
118
+ PF00829_21 117
119
+ PF00410_19 118
120
+ PF00833_18 119
121
+ PF00935_19 120
122
+ PF01992_16 121
@@ -146,56 +146,84 @@ module MiGA::DistanceRunner::Commands
146
146
  ##
147
147
  # Execute a FastAAI command
148
148
  def fastaai_cmd(targets)
149
- qry_idx = dataset.result(:essential_genes).file_path(:fastaai_index_2)
149
+ qry_idx = dataset.result(:essential_genes).file_path(:fastaai_crystal)
150
150
  return nil unless qry_idx
151
151
 
152
152
  # Merge databases
153
153
  donors = []
154
154
  targets.each do |target|
155
- tgt_idx = target&.result(:essential_genes)&.file_path(:fastaai_index_2)
155
+ tgt_idx = target&.result(:essential_genes)&.file_path(:fastaai_crystal)
156
156
  donors << tgt_idx if tgt_idx
157
157
  end
158
158
  return nil if donors.empty?
159
159
 
160
160
  # Build target database
161
- f1 = tmp_file
162
- if donors.size == 1
163
- FileUtils.cp(donors.first, f1)
164
- else
165
- File.open(f0 = tmp_file, 'w') { |fh| donors.each { |i| fh.puts i } }
166
- run_cmd(
167
- <<~CMD
168
- FastAAI merge_db --threads #{opts[:thr]} \
169
- --donor_file "#{f0}" --recipient "#{f1}"
170
- CMD
171
- )
172
- raise "Cannot merge databases into: #{f1}" unless File.size?(f1)
173
- end
161
+ fastaai_dir = File.join(MiGA::MiGA.root_path, 'utils', 'FastAAI', 'fastaai')
162
+ t_db = tmp_file # Target database (from crystals)
163
+ q_db = tmp_file # Query database (from crystal)
164
+ File.open(crystals = tmp_file, 'w') { |fh| donors.each { |i| fh.puts i } }
165
+ script = File.join(fastaai_dir, 'fastaai_miga_crystals_to_db.py')
166
+ run_cmd(
167
+ <<~CMD
168
+ python3 "#{script}" \
169
+ --crystal_list "#{crystals}" --database_path "#{t_db}" --overwrite
170
+ CMD
171
+ )
172
+ raise "Cannot merge databases into: #{t_db}" unless File.size?(t_db)
173
+ run_cmd(
174
+ <<~CMD
175
+ echo "#{qry_idx}" | \
176
+ python3 "#{script}" \
177
+ --crystal_list /dev/stdin --database_path "#{q_db}" --overwrite
178
+ CMD
179
+ )
180
+ raise "Cannot merge databases into: #{q_db}" unless File.size?(q_db)
174
181
 
175
182
  # Run FastAAI
183
+ script = File.join(fastaai_dir, 'fastaai')
176
184
  run_cmd(
177
185
  <<~CMD
178
- FastAAI db_query --query "#{qry_idx}" --target "#{f1}" \
179
- --output "#{f2 = tmp_file}" --threads #{opts[:thr]} \
180
- --do_stdev
186
+ python3 "#{script}" db_query \
187
+ --query "#{q_db}" --target "#{t_db}" \
188
+ --output "#{out_dir = tmp_file}" \
189
+ --threads 1 --do_stdev
181
190
  CMD
182
191
  )
183
- raise "Cannot find FastAAI output directory: #{f2}" unless Dir.exist?(f2)
192
+ #run_cmd(
193
+ # <<~CMD
194
+ # python3 "#{script}" db_query --query "#{q_db}" --target "#{t_db}" \
195
+ # --output "#{out_dir = tmp_file}" --threads #{opts[:thr]} \
196
+ # --do_stdev
197
+ # CMD
198
+ #)
199
+ raise "Cannot find FastAAI output: #{out_dir}" unless Dir.exist?(out_dir)
184
200
 
185
201
  # Save values in the databases
186
202
  haai_data = {}
187
203
  aai_data = {}
188
204
  # Ugly workaround to the insistence of FastAAI not to provide the files
189
205
  # I ask for ;-)
190
- qry_results = File.basename(qry_idx, '.faix') + '_results.txt'
191
- out_file = File.join(f2, 'results', qry_results)
206
+ # qry_results = File.basename(qry_idx, '.crystal') + '_results.txt'
207
+ # out_file = File.join(out_dir, 'results', qry_results)
208
+ out_file = Dir["#{out_dir}/results/*_results.txt"].first
209
+ unless out_file && File.exist?(out_file)
210
+ raise "Cannot find FastAAI results: #{Dir["#{out_dir}/**/*"]}"
211
+ end
192
212
  File.open(out_file, 'r') do |fh|
193
213
  fh.each do |ln|
194
214
  out = ln.chomp.split("\t")
195
215
  haai_data[out[1]] = [
196
216
  out[2].to_f * 100, out[3].to_f * 100, out[4].to_i, out[5].to_i
197
217
  ]
198
- out[6] = (out[6] =~ /^>/) ? 95.0 : out[6].to_f
218
+ if out[6] =~ /^>/
219
+ # J-bar = 0.843 <=> AAI-hat = 90%
220
+ # This approximation is not in the original FastAAI paper, but it
221
+ # allows to maintain monotonicity at AAI-hat ≥ 90%, which solves
222
+ # indexing issues the ML-estimate of "AAI ~ 95%"
223
+ out[6] = Math.sqrt(out[2].to_f) * 100
224
+ else
225
+ out[6] = out[6].to_f
226
+ end
199
227
  aai_data[out[1]] = [out[6], 0, 0, 0]
200
228
  end
201
229
  end
@@ -204,7 +232,7 @@ module MiGA::DistanceRunner::Commands
204
232
  batch_data_to_db(:aai, aai_data)
205
233
 
206
234
  # Cleanup
207
- [f1, f2].each { |i| FileUtils.rm_r(i) }
235
+ FileUtils.rm_rf([crystals, t_db, q_db, out_dir])
208
236
  end
209
237
 
210
238
  ##
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.18.2
4
+ version: 1.3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-03-07 00:00:00.000000000 Z
11
+ date: 2023-03-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -287,13 +287,30 @@ files:
287
287
  - test/test_helper.rb
288
288
  - test/with_daemon_test.rb
289
289
  - test/with_option_test.rb
290
- - utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm
291
- - utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm
292
- - utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm
293
- - utils/FastAAI/FastAAI
294
290
  - utils/FastAAI/FastAAI-legacy/FastAAI
295
291
  - utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py
292
+ - utils/FastAAI/LICENSE
296
293
  - utils/FastAAI/README.md
294
+ - utils/FastAAI/__init__.py
295
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz
296
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz
297
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz
298
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz
299
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz
300
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz
301
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz
302
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz
303
+ - utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz
304
+ - utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz
305
+ - utils/FastAAI/fastaai/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm
306
+ - utils/FastAAI/fastaai/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm
307
+ - utils/FastAAI/fastaai/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm
308
+ - utils/FastAAI/fastaai/__init__.py
309
+ - utils/FastAAI/fastaai/fastaai
310
+ - utils/FastAAI/fastaai/fastaai.py
311
+ - utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py
312
+ - utils/FastAAI/fastaai/fastaai_miga_preproc.py
313
+ - utils/FastAAI/metadata/Accession_names_and_IDs.txt
297
314
  - utils/adapters.fa
298
315
  - utils/cleanup-databases.rb
299
316
  - utils/core-pan-plot.R