miga-base 0.7.26.2 → 1.0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/classify_wf.rb +2 -2
- data/lib/miga/cli/action/derep_wf.rb +1 -1
- data/lib/miga/cli/action/doctor.rb +57 -14
- data/lib/miga/cli/action/doctor/base.rb +47 -23
- data/lib/miga/cli/action/env.rb +26 -0
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/cli/base.rb +1 -0
- data/lib/miga/common.rb +1 -0
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +5 -4
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +36 -0
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +10 -2
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +12 -8
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/test/remote_dataset_test.rb +1 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/database.rb +0 -1
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- data/utils/subclade/pipeline.rb +2 -2
- metadata +33 -4
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -0,0 +1,172 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
$VERSION = 0.1
|
6
|
+
$:.push File.expand_path('../lib', __FILE__)
|
7
|
+
require 'enveomics_rb/enveomics'
|
8
|
+
require 'tmpdir'
|
9
|
+
|
10
|
+
# Default settings; each command-line switch below overwrites one entry
o = {
  q: false,
  thr: 1,
  len: 0,
  id: 0.0,
  fract: 0.0,
  score: 0.0,
  bin: '',
  program: :'blast+',
  nucl: false
}

# Command-line interface: every recognized switch is stored in +o+, and
# +parse!+ consumes the recognized switches from ARGV
parser = OptionParser.new do |cli|
  script = File.basename($0)
  cli.banner = <<~BANNER

    [Enveomics Collection: #{script} v#{$VERSION}]

    [DEPRECATED: Please use rbm.rb instead]

    Finds the reciprocal best matches between two sets of sequences

    Usage: #{script} [options]

  BANNER

  cli.separator 'Mandatory'
  cli.on(
    '-1', '--seq1 FILE',
    'Path to the FastA file containing the set 1'
  ) do |path|
    o[:seq1] = path
  end
  cli.on(
    '-2', '--seq2 FILE',
    'Path to the FastA file containing the set 2'
  ) do |path|
    o[:seq2] = path
  end
  cli.separator ''

  cli.separator 'Search Options'
  cli.on(
    '-n', '--nucl',
    'Sequences are assumed to be nucleotides (proteins by default)',
    'Incompatible with -p diamond'
  ) do
    o[:nucl] = true
  end
  cli.on(
    '-l', '--len INT', Integer,
    'Minimum alignment length (in residues)',
    "By default: #{o[:len]}"
  ) do |min_len|
    o[:len] = min_len
  end
  cli.on(
    '-f', '--fract FLOAT', Float,
    'Minimum alignment length (as a fraction of the query)',
    'If set, requires BLAST+ or Diamond (see -p)',
    "By default: #{o[:fract]}"
  ) do |min_fract|
    o[:fract] = min_fract
  end
  cli.on(
    '-i', '--id NUM', Float,
    'Minimum alignment identity (in %)',
    "By default: #{o[:id]}"
  ) do |min_id|
    o[:id] = min_id
  end
  cli.on(
    '-s', '--score NUM', Float,
    'Minimum alignment score (in bits)',
    "By default: #{o[:score]}"
  ) do |min_score|
    o[:score] = min_score
  end
  cli.separator ''

  cli.separator 'Software Options'
  cli.on(
    '-b', '--bin DIR',
    'Path to the directory containing the binaries of the search program'
  ) do |bin_dir|
    o[:bin] = bin_dir
  end
  cli.on(
    '-p', '--program STR',
    'Search program to be used. One of: blast+ (default), blast, diamond'
  ) do |name|
    # Stored as a lowercase symbol, which is what the rest of the script tests
    o[:program] = name.downcase.to_sym
  end
  cli.on(
    '-t', '--threads INT', Integer,
    'Number of parallel threads to be used',
    "By default: #{o[:thr]}"
  ) do |threads|
    o[:thr] = threads
  end
  cli.separator ''

  cli.separator 'Other Options'
  cli.on('-q', '--quiet', 'Run quietly (no STDERR output)') do
    o[:q] = true
  end
  cli.on('-h', '--help', 'Display this screen') do
    puts cli
    exit
  end
  cli.separator ''
end
parser.parse!
|
88
|
+
|
89
|
+
# Validate the parsed options before any external program is launched
abort '-1 is mandatory' if o[:seq1].nil?
abort '-2 is mandatory' if o[:seq2].nil?
abort '-p diamond is incompatible with -n' if o[:program] == :diamond && o[:nucl]
if o[:fract] > 0.0 && o[:program] == :blast
  abort 'Argument -f/--fract requires -p blast+ or -p diamond'
end

# +o[:bin]+ is used below as a literal prefix of each program name, so it
# needs a trailing slash whenever a directory was given
o[:bin] = "#{o[:bin]}/" unless o[:bin].empty?

# The sibling scripts of this collection (rbm.rb, sam.filter.rb) publish the
# quiet switch as $QUIET, so that name is set here too; $quiet is kept for any
# code still reading the old name.
# NOTE(review): +say+ is defined outside this file, presumably it checks
# $QUIET; confirm against enveomics_rb
$QUIET = $quiet = o[:q]
|
99
|
+
|
100
|
+
# Runs both search directions inside a scratch directory (removed when the
# block exits) and prints to STDOUT every seq1 best hit that is reciprocal
Dir.mktmpdir do |dir|
  say('Temporal directory: ', dir)

  # Launches an external program without going through a shell, so paths
  # holding quotes, spaces or `$` reach the program untouched. The program's
  # STDOUT is discarded to keep it out of the RBM table, and a failed or
  # missing program stops the script right away instead of surfacing later
  # as a missing result file.
  run = lambda do |*cmd|
    cmd = cmd.map(&:to_s)
    abort "Command failed: #{cmd.join(' ')}" unless system(*cmd, out: File::NULL)
  end

  # Tabular columns requested from BLAST+ and Diamond; column 13 (qlen) is
  # what the -f/--fract filter divides by
  columns = %w[
    qseqid sseqid pident length mismatch gapopen qstart qend
    sstart send evalue bitscore qlen slen
  ]
  has_qlen = %i[blast+ diamond].include?(o[:program])
  blast_bin = o[:nucl] ? 'blastn' : 'blastp'

  # Create databases
  say 'Creating databases'
  %i[seq1 seq2].each do |seq|
    db = "#{dir}/#{seq}"
    case o[:program]
    when :blast
      run.call(
        "#{o[:bin]}formatdb", '-i', o[seq], '-n', db,
        '-p', (o[:nucl] ? 'F' : 'T')
      )
    when :'blast+'
      run.call(
        "#{o[:bin]}makeblastdb", '-in', o[seq], '-out', db,
        '-dbtype', (o[:nucl] ? 'nucl' : 'prot')
      )
    when :diamond
      run.call(
        "#{o[:bin]}diamond", 'makedb', '--in', o[seq],
        '--db', "#{db}.dmnd", '--threads', o[:thr]
      )
    else
      abort "Unsupported program: #{o[:program]}"
    end
  end

  # Best-hits
  rbh = {} # best seq1 hit of each seq2 query, filled by the first pass
  n2 = 0
  say ' Running comparisons'
  # seq2 is searched first so that the seq1 pass can test reciprocity
  [2, 1].each do |i|
    qry_seen = {}
    q = o[:"seq#{i}"]
    s = "#{dir}/seq#{i == 1 ? 2 : 1}"
    tab = "#{dir}/#{i}.tab"
    say(' Query: ', q)
    case o[:program]
    when :blast
      run.call(
        "#{o[:bin]}blastall", '-p', blast_bin, '-d', s, '-i', q,
        '-v', 1, '-b', 1, '-a', o[:thr], '-m', 8, '-o', tab
      )
    when :'blast+'
      run.call(
        "#{o[:bin]}#{blast_bin}", '-db', s, '-query', q,
        '-max_target_seqs', 1, '-num_threads', o[:thr], '-out', tab,
        '-outfmt', "6 #{columns.join(' ')}"
      )
    when :diamond
      daa = "#{dir}/#{i}.daa"
      run.call(
        "#{o[:bin]}diamond", 'blastp', '--threads', o[:thr],
        '--db', "#{s}.dmnd", '--query', q, '--sensitive',
        '--daa', daa, '--quiet'
      )
      run.call(
        "#{o[:bin]}diamond", 'view', '--daa', daa,
        '--outfmt', 6, *columns, '--out', tab, '--quiet'
      )
    else
      abort "Unsupported program: #{o[:program]}"
    end

    n = 0
    File.foreach(tab) do |ln|
      ln.chomp!
      row = ln.split(/\t/)
      # Without a query-length column the fraction filter is neutralized
      # (-f is rejected for legacy BLAST during option validation)
      row[12] = '1' unless has_qlen
      # Only the first row of each query that passes every threshold counts
      next unless qry_seen[row[0]].nil? &&
                  row[3].to_i >= o[:len] && row[2].to_f >= o[:id] &&
                  row[11].to_f >= o[:score] &&
                  row[3].to_f / row[12].to_i >= o[:fract]

      qry_seen[row[0]] = 1
      n += 1
      if i == 2
        rbh[row[0]] = row[1]
      elsif !rbh[row[1]].nil? && rbh[row[1]] == row[0]
        puts ln
        n2 += 1
      end
    end
    say " #{n} sequences with hit"
  end
  say " #{n2} RBMs"
end
|
@@ -1,146 +1,100 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
4
|
-
# @author: Luis M. Rodriguez-R
|
5
|
-
# @update: Aug-25-2015
|
6
|
-
# @license: artistic license 2.0
|
7
|
-
#
|
3
|
+
# frozen_string_literal: true
|
8
4
|
|
9
|
-
|
5
|
+
$VERSION = 1.0
|
6
|
+
$:.push File.expand_path('../lib', __FILE__)
|
7
|
+
require 'enveomics_rb/rbm'
|
10
8
|
require 'tmpdir'
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
# Take the default value of every search setting from a throw-away RBM object
# (built on the placeholder inputs '1' and '2'), so the help text and the
# library report the same defaults
defaults = Enveomics::RBM.new('1', '2').bms1
setting_keys = %i[thr len id fract score bin program nucl]
o = setting_keys.each_with_object({ q: false }) do |key, opts|
  opts[key] = defaults.opt(key)
end
|
15
|
+
|
15
16
|
OptionParser.new do |opts|
|
16
|
-
|
17
|
-
|
17
|
+
cmd = File.basename($0)
|
18
|
+
opts.banner = <<~BANNER
|
18
19
|
|
19
|
-
|
20
|
-
opts.separator ""
|
21
|
-
opts.separator "Mandatory"
|
22
|
-
opts.on("-1", "--seq1 FILE",
|
23
|
-
"Path to the FastA file containing the set 1."){ |v| o[:seq1] = v }
|
24
|
-
opts.on("-2", "--seq2 FILE",
|
25
|
-
"Path to the FastA file containing the set 2."){ |v| o[:seq2] = v }
|
26
|
-
opts.separator ""
|
27
|
-
opts.separator "Search Options"
|
28
|
-
opts.on("-n", "--nucl",
|
29
|
-
"Sequences are assumed to be nucleotides (proteins by default)."
|
30
|
-
){ |v| o[:nucl] = true }
|
31
|
-
opts.on("-l", "--len INT",
|
32
|
-
"Minimum alignment length (in residues). By default: #{o[:len]}."
|
33
|
-
){ |v| o[:len] = v.to_i }
|
34
|
-
opts.on("-f", "--fract FLOAT",
|
35
|
-
"Minimum alignment length (as a fraction of the query).",
|
36
|
-
"If set, requires BLAST+ or Diamond (see -p). By default: #{o[:fract]}."
|
37
|
-
){ |v| o[:fract] = v.to_i }
|
38
|
-
opts.on("-i", "--id NUM",
|
39
|
-
"Minimum alignment identity (in %). By default: #{o[:id].to_s}."
|
40
|
-
){ |v| o[:id] = v.to_f }
|
41
|
-
opts.on("-s", "--score NUM",
|
42
|
-
"Minimum alignment score (in bits). By default: #{o[:score]}."
|
43
|
-
){ |v| o[:score] = v.to_f }
|
44
|
-
opts.separator ""
|
45
|
-
opts.separator "Software Options"
|
46
|
-
opts.on("-b", "--bin DIR",
|
47
|
-
"Path to the directory containing the binaries of the search program."
|
48
|
-
){ |v| o[:bin] = v }
|
49
|
-
opts.on("-p", "--program STR",
|
50
|
-
"Search program to be used. One of: blast+ (default), blast, diamond."
|
51
|
-
){ |v| o[:program] = v }
|
52
|
-
opts.on("-t", "--threads INT",
|
53
|
-
"Number of parallel threads to be used. By default: #{o[:thr]}."
|
54
|
-
){ |v| o[:thr] = v.to_i }
|
55
|
-
opts.separator ""
|
56
|
-
opts.separator "Other Options"
|
57
|
-
opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
|
58
|
-
opts.on("-h", "--help", "Display this screen") do
|
59
|
-
puts opts
|
60
|
-
exit
|
61
|
-
end
|
62
|
-
opts.separator ""
|
63
|
-
end.parse!
|
64
|
-
abort "-1 is mandatory" if o[:seq1].nil?
|
65
|
-
abort "-2 is mandatory" if o[:seq2].nil?
|
66
|
-
abort '-p diamond is incompatible with -n' if o[:program]=='diamond' && o[:nucl]
|
67
|
-
abort 'Argument -f/--fract requires -p blast+ or -p diamond' if
|
68
|
-
o[:fract]>0 and o[:program]!='blast+' and o[:program]!='diamond'
|
69
|
-
o[:bin] = o[:bin]+"/" if o[:bin].size > 0
|
20
|
+
[Enveomics Collection: #{cmd} v#{$VERSION}]
|
70
21
|
|
71
|
-
|
72
|
-
$stderr.puts "Temporal directory: #{dir}." unless o[:q]
|
22
|
+
Finds the reciprocal best matches between two sets of sequences
|
73
23
|
|
74
|
-
|
75
|
-
$stderr.puts "Creating databases." unless o[:q]
|
76
|
-
[:seq1, :seq2].each do |seq|
|
77
|
-
case o[:program].downcase
|
78
|
-
when 'blast'
|
79
|
-
`"#{o[:bin]}formatdb" -i "#{o[seq]}" -n "#{dir}/#{seq}" \
|
80
|
-
-p #{(o[:nucl]?"F":"T")}`
|
81
|
-
when 'blast+'
|
82
|
-
`"#{o[:bin]}makeblastdb" -in "#{o[seq]}" -out "#{dir}/#{seq}" \
|
83
|
-
-dbtype #{(o[:nucl]?"nucl":"prot")}`
|
84
|
-
when 'diamond'
|
85
|
-
`"#{o[:bin]}diamond" makedb --in "#{dir}/#{seq}.fa" \
|
86
|
-
--db "#{dir}/#{seq}.fa.dmnd" --threads "#{o[:thr]}"`
|
87
|
-
else
|
88
|
-
abort "Unsupported program: #{o[:program]}."
|
89
|
-
end
|
90
|
-
end # |seq|
|
24
|
+
Usage: #{cmd} [options]
|
91
25
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
26
|
+
BANNER
|
27
|
+
|
28
|
+
opts.separator 'Mandatory'
|
29
|
+
opts.on(
|
30
|
+
'-1', '--seq1 FILE',
|
31
|
+
'Path to the FastA file containing the set 1'
|
32
|
+
) { |v| o[:seq1] = v }
|
33
|
+
opts.on(
|
34
|
+
'-2', '--seq2 FILE',
|
35
|
+
'Path to the FastA file containing the set 2'
|
36
|
+
) { |v| o[:seq2] = v }
|
37
|
+
opts.separator ''
|
38
|
+
opts.separator 'Search Options'
|
39
|
+
opts.on(
|
40
|
+
'-n', '--nucl',
|
41
|
+
'Sequences are assumed to be nucleotides (proteins by default)',
|
42
|
+
'Incompatible with -p diamond'
|
43
|
+
) { |v| o[:nucl] = true }
|
44
|
+
opts.on(
|
45
|
+
'-l', '--len INT', Integer,
|
46
|
+
'Minimum alignment length (in residues)',
|
47
|
+
"By default: #{o[:len]}"
|
48
|
+
) { |v| o[:len] = v }
|
49
|
+
opts.on(
|
50
|
+
'-f', '--fract FLOAT', Float,
|
51
|
+
'Minimum alignment length (as a fraction of the query)',
|
52
|
+
'If set, requires BLAST+ or Diamond (see -p)',
|
53
|
+
"By default: #{o[:fract]}"
|
54
|
+
) { |v| o[:fract] = v }
|
55
|
+
opts.on(
|
56
|
+
'-i', '--id NUM', Float,
|
57
|
+
'Minimum alignment identity (in %)',
|
58
|
+
"By default: #{o[:id]}"
|
59
|
+
){ |v| o[:id] = v }
|
60
|
+
opts.on(
|
61
|
+
'-s', '--score NUM', Float,
|
62
|
+
'Minimum alignment score (in bits)',
|
63
|
+
"By default: #{o[:score]}"
|
64
|
+
) { |v| o[:score] = v }
|
65
|
+
opts.separator ''
|
66
|
+
opts.separator 'Software Options'
|
67
|
+
opts.on(
|
68
|
+
'-b', '--bin DIR',
|
69
|
+
'Path to the directory containing the binaries of the search program'
|
70
|
+
) { |v| o[:bin] = v }
|
71
|
+
opts.on(
|
72
|
+
'-p', '--program STR',
|
73
|
+
'Search program to be used',
|
74
|
+
'One of: blast+ (default), blast, diamond, blat'
|
75
|
+
) { |v| o[:program] = v.downcase.to_sym }
|
76
|
+
opts.on(
|
77
|
+
'-t', '--threads INT', Integer,
|
78
|
+
'Number of parallel threads to be used',
|
79
|
+
"By default: #{o[:thr]}"
|
80
|
+
) { |v| o[:thr] = v }
|
81
|
+
opts.separator ''
|
82
|
+
opts.separator 'Other Options'
|
83
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
84
|
+
opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
|
85
|
+
opts.separator ''
|
86
|
+
end.parse!
|
144
87
|
|
88
|
+
# Both sequence sets must be given
{ seq1: '-1', seq2: '-2' }.each do |key, switch|
  raise Enveomics::OptionError, "#{switch} is mandatory" if o[key].nil?
end

# A minimum aligned fraction is only accepted with BLAST+ or Diamond
fract_capable = %i[blast+ diamond].include?(o[:program])
if o[:fract] > 0.0 && !fract_capable
  raise Enveomics::OptionError,
        'Argument -f/--fract requires -p blast+ or -p diamond'
end
$QUIET = o[:q]

# Print every reciprocal best match, then report the three counts via +say+
rbm = Enveomics::RBM.new(o[:seq1], o[:seq2], o)
rbm.each do |match|
  puts match.to_s
end
say('Forward Best Matches: ', rbm.bms1.count)
say('Reverse Best Matches: ', rbm.bms2.count)
say('Reciprocal Best Matches: ', rbm.count)
|
146
100
|
|
@@ -0,0 +1,148 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
$VERSION = 1.0
|
6
|
+
$:.push File.expand_path('../lib', __FILE__)
|
7
|
+
require 'enveomics_rb/enveomics'
|
8
|
+
use 'shellwords'
|
9
|
+
|
10
|
+
# Default settings; each command-line switch below overwrites one entry
o = {
  q: false,
  threads: 2,
  m_format: :sam,
  g_format: :fasta,
  identity: 95.0,
  o: '-',
  header: true
}

# Command-line interface: every recognized switch is stored in +o+
cli = OptionParser.new do |parser|
  usage = "#{File.basename($0)} -m map.sam -o filtered_map.sam [options]"
  Enveomics.opt_banner(
    parser,
    'Filters a SAM or BAM file by target sequences and/or identity',
    usage
  )

  parser.separator 'Input/Output'
  parser.on(
    '-g', '--genome PATH',
    'Genome assembly',
    'Supports compression with .gz extension, use - for STDIN'
  ) do |path|
    o[:g] = path
  end
  parser.on(
    '-m', '--mapping PATH',
    'Mapping file',
    'Supports compression with .gz extension, use - for STDIN'
  ) do |path|
    o[:m] = path
  end
  parser.on(
    '-o', '--out-sam PATH',
    'Output filtered file in SAM format',
    'Supports compression with .gz extension, use - for STDOUT (default)'
  ) do |path|
    o[:o] = path
  end
  parser.separator ''

  parser.separator 'Formats'
  parser.on(
    '--g-format STRING',
    'Genome assembly format: fasta (default) or list'
  ) do |name|
    o[:g_format] = name.downcase.to_sym
  end
  parser.on(
    '--m-format STRING',
    'Mapping file format: sam (default) or bam',
    'sam supports compression with .gz file extension'
  ) do |name|
    o[:m_format] = name.downcase.to_sym
  end
  parser.separator ''

  parser.separator 'General'
  parser.on(
    '-i', '--identity FLOAT', Float,
    "Set a fixed threshold of percent identity (default: #{o[:identity]})"
  ) do |pct|
    o[:identity] = pct
  end
  # OptionParser hands +false+ to the block of a switch declared as --no-*
  parser.on('--no-header', 'Do not include the headers') do |flag|
    o[:header] = flag
  end
  parser.separator ''
  parser.on(
    '-t', '--threads INT', Integer,
    "Threads to use (default: #{o[:threads]})"
  ) do |count|
    o[:threads] = count
  end
  parser.on('-l', '--log PATH', 'Log file to save output') do |path|
    o[:log] = path
  end
  parser.on('-q', '--quiet', 'Run quietly') do |flag|
    o[:q] = flag
  end
  parser.on('-h', '--help', 'Display this screen') do
    puts parser
    exit
  end
  parser.separator ''
end
cli.parse!

$QUIET = o[:q]
|
71
|
+
|
72
|
+
# Functions
|
73
|
+
|
74
|
+
##
# Parses one line +ln+ in SAM format and prints it to +ofh+ if it passes all
# the filters:
#
# - Header lines (starting with +@+) and blank lines are printed only if
#   +header+ is true
# - Alignment lines are dropped if the reference name is <tt>*</tt> (no
#   match), if +target+ is not +nil+ and does not include the reference name,
#   or if the +YT+ tag is present with a value other than +CP+ or +UU+
# - Any remaining line is printed if its identity is at least +identity+,
#   estimated from the +MD+ tag as
#   <tt>100 * (read length - MD letters) / read length</tt>, where the MD
#   letters are the mismatched and deleted reference bases
#
# +target+ can be +nil+ or any object responding to +include?+.
#
# Raises Enveomics::ParseError if an alignment line that passed the filters
# above has no sequence column or no +MD+ tag.
def parse_sam_line(ln, identity, target, header, ofh)
  if ln.start_with?('@') || ln.strip.empty?
    ofh.puts ln if header
    return
  end

  # No match
  row = ln.chomp.split("\t")
  return if row[2] == '*'

  # Filter by target
  return if target && !target.include?(row[2])

  # A truncated line is reported as a parse error rather than crashing on nil
  seq = row[9]
  if seq.nil?
    raise Enveomics::ParseError, "SAM line missing sequence column:\n#{ln}"
  end
  length = seq.size

  # Optional fields follow the 11 mandatory columns as TAG:TYPE:VALUE; the
  # type is dropped and entries without a value are ignored
  pairs = row.drop(11).map { |field| field.sub(/:.:/, ':').split(':', 2) }
  flags = pairs.select { |pair| pair.size == 2 }.to_h

  # Exclude unless concordant or unaligned
  return if flags['YT'] && !%w[CP UU].include?(flags['YT'])

  # Filter by identity
  unless flags['MD']
    raise Enveomics::ParseError,
          "SAM line missing MD flag:\n#{ln}\nFlags: #{flags}"
  end
  # Every letter in MD is a mismatched or deleted reference base; the `^`
  # that introduces a deletion is only a marker and must not be counted
  mismatches = flags['MD'].scan(/[^\d^]/).size
  id = 100.0 * (length - mismatches) / length
  ofh.puts ln if id >= identity
end
|
107
|
+
|
108
|
+
# Reading targets
# +target+ stays nil (no target filter) unless -g was given; otherwise it
# holds the names to keep as a Set, so that the per-line membership test in
# the SAM filter does not scan a whole list for every alignment
target = nil
if o[:g]
  require 'set' # Set is only autoloaded from Ruby 3.2 on
  say 'Loading target sequences to filter'
  gfh = reader(o[:g])
  begin
    target =
      case o[:g_format]
      when :fasta
        # Name of each entry: first word after '>' in the defline
        gfh.each.map { |ln| ln[/^>(\S+)/, 1] }.compact.to_set
      when :list
        # One name per line
        gfh.each.map(&:chomp).to_set
      else
        raise Enveomics::OptionError.new(
          "Unsupported target sequences format: #{o[:g_format]}"
        )
      end
  ensure
    # Released even when the format is rejected
    gfh.close
  end
end
|
127
|
+
|
128
|
+
# Reading and filtering mapping
# Every input line goes through +parse_sam_line+, which writes the lines that
# pass the filters to +ofh+. The handles are closed even if a line fails to
# parse or the format is rejected.
say 'Reading mapping file'
ofh = writer(o[:o])
begin
  case o[:m_format]
  when :sam
    ifh = reader(o[:m])
    begin
      ifh.each do |ln|
        parse_sam_line(ln, o[:identity], target, o[:header], ofh)
      end
    ensure
      ifh.close
    end
  when :bam
    # BAM is decoded to SAM text by samtools; -h makes it emit the headers
    cmd = ['samtools', 'view', o[:m], '-@', o[:threads]]
    cmd << '-h' if o[:header]
    IO.popen(cmd.shelljoin) do |fh|
      fh.each do |ln|
        parse_sam_line(ln, o[:identity], target, o[:header], ofh)
      end
    end
    # IO.popen with a block sets $? to the status of the finished command; a
    # failure must not pass as a successfully filtered (truncated) output
    raise "samtools view failed (#{$?})" unless $?.success?
  else
    raise Enveomics::OptionError.new(
      "Unsupported mapping format: #{o[:m_format]}"
    )
  end
ensure
  ofh.close
end
|
148
|
+
|