miga-base 0.7.26.1 → 1.0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/classify_wf.rb +2 -2
- data/lib/miga/cli/action/derep_wf.rb +1 -1
- data/lib/miga/cli/action/doctor.rb +57 -14
- data/lib/miga/cli/action/doctor/base.rb +47 -23
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/common.rb +1 -0
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +5 -4
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +36 -0
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +10 -2
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/test/remote_dataset_test.rb +1 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/database.rb +0 -1
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- data/utils/subclade/pipeline.rb +2 -2
- metadata +35 -7
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -0,0 +1,293 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'enveomics_rb/stats'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'shellwords'
|
6
|
+
require 'tmpdir'
|
7
|
+
require 'zlib'
|
8
|
+
|
9
|
+
module Enveomics
|
10
|
+
# Wrapper class for ANIr estimation
|
11
|
+
#
|
12
|
+
# Use as: +ANIr.new(opts).go!+
|
13
|
+
class ANIr
  # Options hash (Symbol keys). Keys read by this class: +:m+, +:m_format+,
  # +:g+, +:g_format+, +:r+, +:r_type+, +:r_format+, +:threads+,
  # +:algorithm+, +:coefficient+, +:bimodality+, +:identity+, +:bin_size+,
  # +:L+, +:H+, +:log+, and +:q+
  attr :opts

  # Identities list (unsorted)
  attr :identities

  # Initialize with the options Hash +opts+. The hash is stored by reference
  # and some entries (+:m+, +:algorithm+) may be filled in during the analysis
  def initialize(opts)
    @opts = opts
    @identities = []
  end

  # --------------------------------------------------[ High-level pipelines ]

  # Perform all the analyses
  def go!
    read_input
    detect_identity
    estimate_ani_r
  end

  # Identify input/output mode and read mapping
  #
  # Unless the mapping format is +:list+, a temporary directory is created,
  # the mapping is generated if the file +opts[:m]+ doesn't exist, and the
  # contigs to filter are loaded when a genome (+opts[:g]+) was provided. The
  # temporary directory is always removed before returning.
  #
  # Raises Enveomics::OptionError if the mapping format is not supported
  def read_input
    if opts[:m_format] != :list
      @tmpdir = Dir.mktmpdir
      @filter_contigs = !opts[:g].nil?
      opts[:m] = File.join(@tmpdir, 'map.sam') if opts[:m].nil?
      run_mapping unless File.exist? opts[:m]
      load_contigs_to_filter if @filter_contigs
    end
    read_mapping = :"read_mapping_from_#{opts[:m_format]}"
    raise Enveomics::OptionError.new(
      "Unsupported mapping format: #{opts[:m_format]}"
    ) unless respond_to? read_mapping
    @identities = []
    @sample = nil # The cached sample must not outlive the identities
    send(read_mapping)
    say "- Unfiltered average identity: #{sample.mean}"
    say "- Reads mapped: #{sample.n}"
    save_identities
    save_histogram
  ensure
    @tmpdir ||= nil
    FileUtils.rm_rf @tmpdir if @tmpdir
  end

  # Identify the identity threshold
  #
  # With +opts[:algorithm]+ set to +:auto+, the algorithm is resolved to
  # +:gmm+ if the bimodality coefficient reaches +opts[:bimodality]+, or to
  # +:fix+ otherwise. Only +:gmm+ triggers any further action here
  def detect_identity
    say 'Detecting identity threshold'
    if opts[:algorithm] == :auto
      say "- Bimodality: #{bimodality}"
      opts[:algorithm] = bimodality >= opts[:bimodality] ? :gmm : :fix
    end
    say "- Algorithm: #{opts[:algorithm]}"
    detect_identity_by_gmm if opts[:algorithm] == :gmm
  end

  # Estimate ANIr
  #
  # Discards (in place) all identities below +opts[:identity]+ and reports
  # the mean of the remaining ones
  def estimate_ani_r
    say 'Estimating ANIr'
    @sample = nil # Empty cached sample
    @identities.delete_if { |i| i < opts[:identity] }
    say "- ANIr: #{sample.mean}"
  end

  # -----------------------------------------------------------------[ Utils ]

  # Show progress unless +opts[:q]+
  #
  # The message is also appended to the file +opts[:log]+ if set, regardless
  # of +opts[:q]+
  def say(*msg)
    o = '[%s] %s' % [Time.now, msg.join('')]
    $stderr.puts(o) unless opts[:q]
    File.open(opts[:log], 'a') { |fh| fh.puts o } if opts[:log]
  end

  # Execute command +cmd+ (Array of program and arguments), appending its
  # combined standard output and error to +opts[:log]+ if set
  #
  # The command is spawned directly (without a shell) so that the exit status
  # checked is that of the command itself and not of a trailing pipe, and so
  # that neither the arguments nor the log path are subject to shell parsing.
  #
  # Raises Enveomics::CommandError if the command cannot be launched or if it
  # exits with non-zero status
  def run(cmd)
    say " - Running: #{cmd.join(' ')}"
    begin
      # Non-String arguments (e.g., an Integer thread count) are stringified
      output = IO.popen(cmd.map(&:to_s), err: %i[child out], &:read)
    rescue SystemCallError => e
      raise Enveomics::CommandError.new("#{cmd.first} failed: #{e.message}")
    end
    status = $?
    File.open(opts[:log], 'a') { |fh| fh.print output } if opts[:log]
    return if status.success?

    raise Enveomics::CommandError.new("#{cmd.first} failed: #{status}")
  end

  # Returns an open file handle for the file, supporting .gz
  def reader(file)
    file =~ /\.gz$/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
  end

  # Is the mapping in SAM format?
  def sam?
    opts[:m_format] == :sam
  end

  # ------------------------------------------------------------[ Map it out ]

  # Execute Bowtie2 and generate SAM file
  #
  # Builds an index of +opts[:g]+ in the temporary directory and maps the
  # reads in +opts[:r]+ saving the output in +opts[:m]+. Contig filtering is
  # turned off by this method.
  #
  # Raises Enveomics::OptionError if the mapping format is not SAM, the genome
  # format is not FastA, or the reads type is not supported
  def run_mapping
    say 'Running mapping using Bowtie2'
    raise Enveomics::OptionError.new(
      'Only SAM output is supported for mapping'
    ) unless sam?

    @filter_contigs = false
    say '- Indexing input sequences'
    raise Enveomics::OptionError.new(
      'Only FastA genome input is supported for mapping'
    ) unless opts[:g_format] == :fasta

    idx = File.join(@tmpdir, 'genome.idx')
    run(['bowtie2-build', opts[:g], idx])

    say '- Mapping metagenomic reads to genome assembly'
    cmd = [
      'bowtie2', '-x', idx, '-p', opts[:threads], '-S', opts[:m], '--no-mixed'
    ]
    cmd << '-f' if opts[:r_format] == :fasta
    cmd +=
      case opts[:r_type]
      when :single
        ['-U', opts[:r]]
      when :coupled
        # Both mate files are passed as a single comma-delimited value
        pairs = opts[:r].split(',', 2)
        ['-1', pairs[0], '-2', pairs[1], '--no-discordant']
      when :interleaved
        ['--interleaved', opts[:r], '--no-discordant']
      else
        raise Enveomics::OptionError.new(
          "Unsupported reads type: #{opts[:r_type]}"
        )
      end
    run(cmd)
  end

  # If +@filter_contigs+ is true, reads the genome assembly and saves contig
  # names to filter the mapping
  #
  # Raises Enveomics::OptionError if the genome format is not supported
  def load_contigs_to_filter
    return unless @filter_contigs

    say 'Loading contigs to filter'
    fh = reader(opts[:g])
    begin
      @contigs_to_filter =
        case opts[:g_format]
        when :fasta
          # Contig name: defline up to the first whitespace
          fh.each.map { |ln| $1 if ln =~ /^>(\S+)/ }.compact
        when :list
          fh.each.map(&:chomp)
        else
          raise Enveomics::OptionError.new(
            "Unsupported genome assembly format: #{opts[:g_format]}"
          )
        end
    ensure
      fh.close
    end
    say "- Got #{@contigs_to_filter.size} contigs"
  end

  # Reads the mapping file assuming SAM format
  def read_mapping_from_sam
    say 'Reading mapping from SAM file'
    fh = reader(opts[:m])
    begin
      fh.each { |ln| parse_sam_line(ln) }
    ensure
      fh.close
    end
  end

  # Reads the mapping file assuming BAM format
  #
  # The file is decoded by piping the output of +samtools view+
  def read_mapping_from_bam
    say 'Reading mapping from BAM file'
    IO.popen(['samtools', 'view', opts[:m]].shelljoin) do |fh|
      fh.each { |ln| parse_sam_line(ln) }
    end
  end

  # Reads the mapping file assuming a Tabular BLAST format
  #
  # The second column is used as contig name and the third as identity
  def read_mapping_from_tab
    say 'Reading mapping from Tabular BLAST file'
    fh = reader(opts[:m])
    begin
      fh.each do |ln|
        next if ln =~ /^\s*(#.*)?$/ # Comment or empty line

        row = ln.chomp.split("\t")
        next if @filter_contigs && !@contigs_to_filter.include?(row[1])

        @identities << row[2].to_f
      end
    ensure
      fh.close
    end
  end

  # Reads the identities from a raw-text list
  def read_mapping_from_list
    say 'Reading identities from raw text list'
    fh = reader(opts[:m])
    begin
      @identities = fh.each.map(&:to_f)
    ensure
      fh.close
    end
  end

  # Parses one line in SAM format
  #
  # Header lines, empty lines, unmapped reads, reads mapped to contigs not in
  # the filter list (if filtering), and reads with a +YT+ flag other than
  # +CP+ or +UU+ are ignored. For all other lines, the identity is estimated
  # from the sequence length and the number of non-digit characters in the
  # +MD+ flag, and appended to the identities list.
  #
  # Raises Enveomics::ParseError if the line lacks the +MD+ flag
  def parse_sam_line(ln)
    return if ln =~ /^@/ || ln =~ /^\s*$/

    row = ln.chomp.split("\t")
    return if row[2] == '*'
    return if @filter_contigs && !@contigs_to_filter.include?(row[2])

    length = row[9].size
    row.shift(11) # Discard non-flag columns
    # Flags are TAG:TYPE:VALUE, the type is dropped to index VALUE by TAG
    flags = Hash[row.map { |i| i.sub(/:.:/, ':').split(':', 2) }]
    return if flags['YT'] && !%w[CP UU].include?(flags['YT'])

    unless flags['MD']
      raise Enveomics::ParseError.new(
        "SAM line missing MD flag:\n#{ln}\nFlags: #{flags}"
      )
    end
    mismatches = flags['MD'].scan(/[^\d]/).count
    @identities << 100.0 * (length - mismatches) / length
  end

  # Save identities as raw text in +opts[:L]+ (if set), one per line
  def save_identities
    return unless opts[:L]

    say '- Saving identities'
    File.open(opts[:L], 'w') do |fh|
      identities.each { |i| fh.puts i }
    end
  end

  # Save identity histogram as raw tab-delimited text in +opts[:H]+ (if set)
  def save_histogram
    return unless opts[:H]

    say '- Saving histogram'
    File.open(opts[:H], 'w') do |fh|
      fh.puts "from\tto\tcount"
      sample.histo_ranges.each_with_index do |r, k|
        fh.puts((r + [sample.histo_counts[k]]).join("\t"))
      end
    end
  end

  # -----------------------------------------------------------[ Peak finder ]

  # Detect identity threshold by gaussian mixture model EM
  def detect_identity_by_gmm
    model_identities_by_gmm_em
    detect_valley_by_gmm
  end

  # Model identities as a 2-gaussian mix by EM
  #
  # Currently always raises Enveomics::UnimplementedError
  def model_identities_by_gmm_em
    say 'Modeling identities by gaussian mixture model using EM'
    # TODO: Implement
    raise Enveomics::UnimplementedError.new('Unimplemented operation')
  end

  # Detect valley by gaussian mix
  #
  # Currently always raises Enveomics::UnimplementedError
  def detect_valley_by_gmm
    say 'Detecting valley by gaussian mixture model'
    # TODO: Implement
    raise Enveomics::UnimplementedError.new('Unimplemented operation')
  end

  # -----------------------------------------------------------[ Do the math ]

  # Identities as a Enveomics::Stats::Sample object
  #
  # The object is cached, set +@sample+ to nil after modifying the identities
  def sample
    @sample ||= Enveomics::Stats::Sample.new(
      identities,
      effective_range: [nil, 100.0],
      histo_bin_size: opts[:bin_size]
    )
  end

  # Returns the bimodality coefficient indicated by +opts[:coefficient]+
  # (+:sarle+ or +:dma+). The value is cached after the first call.
  #
  # Raises Enveomics::OptionError if the coefficient is not supported
  def bimodality
    @bimodality ||=
      case opts[:coefficient]
      when :sarle
        sample.sarle_bimodality
      when :dma
        sample.dma_bimodality
      else
        raise Enveomics::OptionError.new(
          "Unsupported coefficient of bimodality: #{opts[:coefficient]}"
        )
      end
  end
end
|
293
|
+
end
|
@@ -0,0 +1,175 @@
|
|
1
|
+
|
2
|
+
require 'enveomics_rb/enveomics'
|
3
|
+
require 'enveomics_rb/match'
|
4
|
+
use 'tmpdir'
|
5
|
+
use 'shellwords'
|
6
|
+
|
7
|
+
module Enveomics
|
8
|
+
class BMset
  # Paths to the query and subject sequences, as passed to #initialize.
  # Readers for the set of matches and the options are defined explicitly
  # below as #set and #opt
  attr_reader :qry, :sbj

  ##
  # Initialize Enveomics::BMset object with sequence paths +qry+ and +sbj+,
  # and options Hash +opts+ (see #opt for supported options) with Symbol keys
  def initialize(qry, sbj, opts = {})
    @qry = qry
    @sbj = sbj
    @set = nil
    @opt = opts
  end

  ##
  # Returns option with key +k+ as defined by #initialize or by default
  # Supported options include [defaults in brackets]:
  # - len [0]: Minimum alignment length in residues
  # - id [0.0]: Minimum alignment identity in percent
  # - fract [0.0]: Minimum alignment length as fraction of the query
  # - score [0.0]: Minimum alignment score in bits
  # - nucl [false]: The sequences are in nucleotides
  # - thr [1]: Number of threads to use
  # - bin ['']: Path to the directory containing binaries
  # - program [:blast+]: Search engine to use
  #
  # The default is stored in the options Hash on first access
  def opt(k)
    @defaults ||= {
      len: 0, id: 0.0, fract: 0.0, score: 0.0,
      nucl: false, thr: 1, bin: '', program: :'blast+'
    }
    k = k.to_sym
    @opt[k] = @defaults[k] if @opt[k].nil?
    @opt[k]
  end

  ##
  # Hash of Enveomics::Match objects indexed by query, as populated by
  # #match_and_filter! (executed on first access)
  def set
    match_and_filter! if @set.nil?
    @set
  end

  ##
  # Returns the best match of query +qry+ as Enveomics::Match or nil if
  # no qualifying match was found
  def [](qry)
    set[qry]
  end

  ##
  # Number of matches found
  def count
    set.count
  end

  ##
  # Execute search and filter matches
  #
  # For each query, only the match with the highest score is kept among those
  # reaching the minimum +len+, +id+, +score+, and +fract+ (see #opt)
  def match_and_filter!
    @set = {}
    match!.each do |match|
      # Already a better match?
      next if self[match.qry] && self[match.qry].score >= match.score

      # Is this a good enough match?
      next unless %i[len id score fract].all? do |metric|
        match.send(metric) >= opt(metric)
      end

      # Save match
      @set[match.qry] = match
    end
  end

  ##
  # Find all matches and return as an array of Enveomics::Match objects
  #
  # The search is executed in a temporary directory using the engine
  # indicated by the +program+ option (+:blast+, +:blast++, +:diamond+, or
  # +:blat+). If any command fails, the error and the error log (if any) are
  # printed to STDERR and the process exits with status 1.
  #
  # Raises Enveomics::OptionError if the search engine is not supported, or
  # if +:diamond+ is requested for nucleotides
  def match!
    y = []
    Dir.mktmpdir do |dir|
      # Determine commands
      say('Temporal directory: ', dir)
      db_path = File.join(dir, 'sbj.db')
      out_path = File.join(dir, 'out.tsv')
      cmds = []
      case opt(:program)
      when :blast
        cmds << [
          'formatdb', '-i', sbj, '-n', db_path, '-l', File.join(dir, 'log'),
          '-p', opt(:nucl) ? 'F' : 'T'
        ]
        cmds << [
          'blastall', '-p', opt(:nucl) ? 'blastn' : 'blastp', '-d', db_path,
          '-i', qry, '-v', '1', '-b', '1', '-a', opt(:thr).to_s, '-m', '8',
          '-o', out_path
        ]
      when :'blast+'
        cmds << [
          'makeblastdb', '-in', sbj, '-out', db_path,
          '-dbtype', opt(:nucl) ? 'nucl' : 'prot'
        ]
        cmds << [
          opt(:nucl) ? 'blastn' : 'blastp', '-db', db_path, '-query', qry,
          '-num_threads', opt(:thr).to_s, '-out', out_path, '-outfmt',
          '6 qseqid sseqid pident length mismatch gapopen qstart qend ' \
            'sstart send evalue bitscore qlen slen'
        ]
      when :diamond
        raise Enveomics::OptionError.new(
          'Unsupported search engine diamond for nucleotides'
        ) if opt(:nucl)
        cmds << [
          'diamond', 'makedb', '--in', sbj, '--db', db_path,
          '--threads', opt(:thr).to_s
        ]
        cmds << [
          'diamond', 'blastp', '--threads', opt(:thr).to_s,
          '--db', db_path, '--query', qry, '--daa', "#{out_path}.daa",
          '--quiet', '--sensitive'
        ]
        # Unlike blast+, each output field is passed as a separate argument
        cmds << [
          'diamond', 'view', '--daa', "#{out_path}.daa", '--out', out_path,
          '--quiet', '--outfmt'
        ] + %w[6 qseqid sseqid pident length mismatch gapopen qstart] +
                %w[qend sstart send evalue bitscore qlen slen]
      when :blat
        cmds << ['blat', sbj, qry, '-out=blast8', out_path]
        cmds[0] << '-prot' unless opt(:nucl)
      else
        raise Enveomics::OptionError.new(
          "Unsupported search engine: #{opt(:program)}"
        )
      end

      # Run commands
      say('Running comparison')
      say('Query: ', qry)
      say('Subject: ', sbj)
      cmd_err = File.join(dir, 'err')
      begin
        cmds.each do |cmd|
          cmd[0] = File.join(opt(:bin), cmd[0]) unless opt(:bin) == ''
          run_cmd(cmd, stderr: cmd_err)
        end
      rescue Enveomics::CommandError => e
        $stderr.puts e
        $stderr.puts ''
        $stderr.puts '[ Error log ]'
        # The log may not exist if the command failed before writing to it
        $stderr.puts File.read(cmd_err) if File.exist?(cmd_err)
        exit 1 # Report the failure to the calling process
      end

      # Parse output
      File.open(out_path, 'r') do |fh|
        fh.each { |ln| y << Enveomics::Match.new(ln) }
      end
    end
    y
  end

  ##
  # Enumerate RBMs and yield +blk+
  #
  # Without a block, returns an Enumerator
  def each(&blk)
    if block_given?
      set.each { |_, bm| blk.call(bm) }
    else
      to_enum(:each)
    end
  end
end
|
175
|
+
end
|