miga-base 1.2.18.2 → 1.3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (31) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor/base.rb +2 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/dataset/result/add.rb +3 -2
  5. data/lib/miga/version.rb +2 -2
  6. data/scripts/essential_genes.bash +4 -8
  7. data/utils/FastAAI/LICENSE +8 -0
  8. data/utils/FastAAI/README.md +151 -40
  9. data/utils/FastAAI/__init__.py +1 -0
  10. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
  11. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
  12. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
  13. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
  14. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
  15. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
  16. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
  17. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
  18. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
  19. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
  20. data/utils/FastAAI/fastaai/__init__.py +1 -0
  21. data/utils/FastAAI/fastaai/fastaai +4805 -0
  22. data/utils/FastAAI/fastaai/fastaai.py +4805 -0
  23. data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
  24. data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
  25. data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
  26. data/utils/distance/commands.rb +51 -23
  27. metadata +23 -6
  28. data/utils/FastAAI/FastAAI +0 -3659
  29. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
  30. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
  31. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,122 @@
1
+ PF01780_19 0
2
+ PF03948_14 1
3
+ PF17144_4 2
4
+ PF00830_19 3
5
+ PF00347_23 4
6
+ PF16906_5 5
7
+ PF13393_6 6
8
+ PF02565_15 7
9
+ PF01991_18 8
10
+ PF01984_20 9
11
+ PF00861_22 10
12
+ PF13656_6 11
13
+ PF00368_18 12
14
+ PF01142_18 13
15
+ PF00312_22 14
16
+ PF02367_17 15
17
+ PF01951_16 16
18
+ PF00749_21 17
19
+ PF01655_18 18
20
+ PF00318_20 19
21
+ PF01813_17 20
22
+ PF01649_18 21
23
+ PF01025_19 22
24
+ PF00380_19 23
25
+ PF01282_19 24
26
+ PF01864_17 25
27
+ PF01783_23 26
28
+ PF01808_18 27
29
+ PF01982_16 28
30
+ PF01715_17 29
31
+ PF00213_18 30
32
+ PF00119_20 31
33
+ PF00573_22 32
34
+ PF01981_16 33
35
+ PF00281_19 34
36
+ PF00584_20 35
37
+ PF00825_18 36
38
+ PF00406_22 37
39
+ PF00177_21 38
40
+ PF01192_22 39
41
+ PF05833_11 40
42
+ PF02699_15 41
43
+ PF01016_19 42
44
+ PF01765_19 43
45
+ PF00453_18 44
46
+ PF01193_24 45
47
+ PF05221_17 46
48
+ PF00231_19 47
49
+ PF00416_22 48
50
+ PF02033_18 49
51
+ PF01668_18 50
52
+ PF00886_19 51
53
+ PF00252_18 52
54
+ PF00572_18 53
55
+ PF00366_20 54
56
+ PF04104_14 55
57
+ PF04919_12 56
58
+ PF01912_18 57
59
+ PF00276_20 58
60
+ PF00203_21 59
61
+ PF00889_19 60
62
+ PF02996_17 61
63
+ PF00121_18 62
64
+ PF01990_17 63
65
+ PF00344_20 64
66
+ PF00297_22 65
67
+ PF01196_19 66
68
+ PF01194_17 67
69
+ PF01725_16 68
70
+ PF00750_19 69
71
+ PF00338_22 70
72
+ PF00238_19 71
73
+ PF01200_18 72
74
+ PF00162_19 73
75
+ PF00181_23 74
76
+ PF01866_17 75
77
+ PF00709_21 76
78
+ PF02006_16 77
79
+ PF00164_25 78
80
+ PF00237_19 79
81
+ PF01139_17 80
82
+ PF01351_18 81
83
+ PF04010_13 82
84
+ PF06093_13 83
85
+ PF00828_19 84
86
+ PF02410_15 85
87
+ PF01176_19 86
88
+ PF02130_17 87
89
+ PF01948_18 88
90
+ PF01195_19 89
91
+ PF01746_21 90
92
+ PF01667_17 91
93
+ PF03874_16 92
94
+ PF01090_19 93
95
+ PF01198_19 94
96
+ PF01250_17 95
97
+ PF17136_4 96
98
+ PF06026_14 97
99
+ PF03652_15 98
100
+ PF04019_12 99
101
+ PF01201_22 100
102
+ PF00832_20 101
103
+ PF01264_21 102
104
+ PF03840_14 103
105
+ PF00831_23 104
106
+ PF00189_20 105
107
+ PF02601_15 106
108
+ PF01496_19 107
109
+ PF00411_19 108
110
+ PF00334_19 109
111
+ PF00687_21 110
112
+ PF01157_18 111
113
+ PF01245_20 112
114
+ PF01994_16 113
115
+ PF01632_19 114
116
+ PF00827_17 115
117
+ PF01015_18 116
118
+ PF00829_21 117
119
+ PF00410_19 118
120
+ PF00833_18 119
121
+ PF00935_19 120
122
+ PF01992_16 121
@@ -146,56 +146,84 @@ module MiGA::DistanceRunner::Commands
146
146
  ##
147
147
  # Execute a FastAAI command
148
148
  def fastaai_cmd(targets)
149
- qry_idx = dataset.result(:essential_genes).file_path(:fastaai_index_2)
149
+ qry_idx = dataset.result(:essential_genes).file_path(:fastaai_crystal)
150
150
  return nil unless qry_idx
151
151
 
152
152
  # Merge databases
153
153
  donors = []
154
154
  targets.each do |target|
155
- tgt_idx = target&.result(:essential_genes)&.file_path(:fastaai_index_2)
155
+ tgt_idx = target&.result(:essential_genes)&.file_path(:fastaai_crystal)
156
156
  donors << tgt_idx if tgt_idx
157
157
  end
158
158
  return nil if donors.empty?
159
159
 
160
160
  # Build target database
161
- f1 = tmp_file
162
- if donors.size == 1
163
- FileUtils.cp(donors.first, f1)
164
- else
165
- File.open(f0 = tmp_file, 'w') { |fh| donors.each { |i| fh.puts i } }
166
- run_cmd(
167
- <<~CMD
168
- FastAAI merge_db --threads #{opts[:thr]} \
169
- --donor_file "#{f0}" --recipient "#{f1}"
170
- CMD
171
- )
172
- raise "Cannot merge databases into: #{f1}" unless File.size?(f1)
173
- end
161
+ fastaai_dir = File.join(MiGA::MiGA.root_path, 'utils', 'FastAAI', 'fastaai')
162
+ t_db = tmp_file # Target database (from crystals)
163
+ q_db = tmp_file # Query database (from crystal)
164
+ File.open(crystals = tmp_file, 'w') { |fh| donors.each { |i| fh.puts i } }
165
+ script = File.join(fastaai_dir, 'fastaai_miga_crystals_to_db.py')
166
+ run_cmd(
167
+ <<~CMD
168
+ python3 "#{script}" \
169
+ --crystal_list "#{crystals}" --database_path "#{t_db}" --overwrite
170
+ CMD
171
+ )
172
+ raise "Cannot merge databases into: #{t_db}" unless File.size?(t_db)
173
+ run_cmd(
174
+ <<~CMD
175
+ echo "#{qry_idx}" | \
176
+ python3 "#{script}" \
177
+ --crystal_list /dev/stdin --database_path "#{q_db}" --overwrite
178
+ CMD
179
+ )
180
+ raise "Cannot merge databases into: #{q_db}" unless File.size?(q_db)
174
181
 
175
182
  # Run FastAAI
183
+ script = File.join(fastaai_dir, 'fastaai')
176
184
  run_cmd(
177
185
  <<~CMD
178
- FastAAI db_query --query "#{qry_idx}" --target "#{f1}" \
179
- --output "#{f2 = tmp_file}" --threads #{opts[:thr]} \
180
- --do_stdev
186
+ python3 "#{script}" db_query \
187
+ --query "#{q_db}" --target "#{t_db}" \
188
+ --output "#{out_dir = tmp_file}" \
189
+ --threads 1 --do_stdev
181
190
  CMD
182
191
  )
183
- raise "Cannot find FastAAI output directory: #{f2}" unless Dir.exist?(f2)
192
+ #run_cmd(
193
+ # <<~CMD
194
+ # python3 "#{script}" db_query --query "#{q_db}" --target "#{t_db}" \
195
+ # --output "#{out_dir = tmp_file}" --threads #{opts[:thr]} \
196
+ # --do_stdev
197
+ # CMD
198
+ #)
199
+ raise "Cannot find FastAAI output: #{out_dir}" unless Dir.exist?(out_dir)
184
200
 
185
201
  # Save values in the databases
186
202
  haai_data = {}
187
203
  aai_data = {}
188
204
  # Ugly workaround to the insistence of FastAAI not to provide the files
189
205
  # I ask for ;-)
190
- qry_results = File.basename(qry_idx, '.faix') + '_results.txt'
191
- out_file = File.join(f2, 'results', qry_results)
206
+ # qry_results = File.basename(qry_idx, '.crystal') + '_results.txt'
207
+ # out_file = File.join(out_dir, 'results', qry_results)
208
+ out_file = Dir["#{out_dir}/results/*_results.txt"].first
209
+ unless out_file && File.exist?(out_file)
210
+ raise "Cannot find FastAAI results: #{Dir["#{out_dir}/**/*"]}"
211
+ end
192
212
  File.open(out_file, 'r') do |fh|
193
213
  fh.each do |ln|
194
214
  out = ln.chomp.split("\t")
195
215
  haai_data[out[1]] = [
196
216
  out[2].to_f * 100, out[3].to_f * 100, out[4].to_i, out[5].to_i
197
217
  ]
198
- out[6] = (out[6] =~ /^>/) ? 95.0 : out[6].to_f
218
+ if out[6] =~ /^>/
219
+ # J-bar = 0.843 <=> AAI-hat = 90%
220
+ # This approximation is not in the original FastAAI paper, but it
221
+ # preserves monotonicity at AAI-hat ≥ 90%, which solves the
222
+ # indexing issues with the ML-estimate of "AAI ~ 95%"
223
+ out[6] = Math.sqrt(out[2].to_f) * 100
224
+ else
225
+ out[6] = out[6].to_f
226
+ end
199
227
  aai_data[out[1]] = [out[6], 0, 0, 0]
200
228
  end
201
229
  end
@@ -204,7 +232,7 @@ module MiGA::DistanceRunner::Commands
204
232
  batch_data_to_db(:aai, aai_data)
205
233
 
206
234
  # Cleanup
207
- [f1, f2].each { |i| FileUtils.rm_r(i) }
235
+ FileUtils.rm_rf([crystals, t_db, q_db, out_dir])
208
236
  end
209
237
 
210
238
  ##
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.18.2
4
+ version: 1.3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-03-07 00:00:00.000000000 Z
11
+ date: 2023-03-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -287,13 +287,30 @@ files:
287
287
  - test/test_helper.rb
288
288
  - test/with_daemon_test.rb
289
289
  - test/with_option_test.rb
290
- - utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm
291
- - utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm
292
- - utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm
293
- - utils/FastAAI/FastAAI
294
290
  - utils/FastAAI/FastAAI-legacy/FastAAI
295
291
  - utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py
292
+ - utils/FastAAI/LICENSE
296
293
  - utils/FastAAI/README.md
294
+ - utils/FastAAI/__init__.py
295
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz
296
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz
297
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz
298
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz
299
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz
300
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz
301
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz
302
+ - utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz
303
+ - utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz
304
+ - utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz
305
+ - utils/FastAAI/fastaai/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm
306
+ - utils/FastAAI/fastaai/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm
307
+ - utils/FastAAI/fastaai/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm
308
+ - utils/FastAAI/fastaai/__init__.py
309
+ - utils/FastAAI/fastaai/fastaai
310
+ - utils/FastAAI/fastaai/fastaai.py
311
+ - utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py
312
+ - utils/FastAAI/fastaai/fastaai_miga_preproc.py
313
+ - utils/FastAAI/metadata/Accession_names_and_IDs.txt
297
314
  - utils/adapters.fa
298
315
  - utils/cleanup-databases.rb
299
316
  - utils/core-pan-plot.R