miga-base 0.7.26.2 → 1.0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/env.rb +26 -0
  11. data/lib/miga/cli/action/init.rb +11 -7
  12. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  13. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  14. data/lib/miga/cli/action/tax_dist.rb +2 -2
  15. data/lib/miga/cli/action/wf.rb +5 -4
  16. data/lib/miga/cli/base.rb +1 -0
  17. data/lib/miga/common.rb +1 -0
  18. data/lib/miga/daemon.rb +11 -4
  19. data/lib/miga/dataset/result.rb +10 -6
  20. data/lib/miga/json.rb +5 -4
  21. data/lib/miga/metadata.rb +5 -1
  22. data/lib/miga/parallel.rb +36 -0
  23. data/lib/miga/project.rb +8 -8
  24. data/lib/miga/project/base.rb +4 -4
  25. data/lib/miga/project/result.rb +2 -2
  26. data/lib/miga/sqlite.rb +10 -2
  27. data/lib/miga/version.rb +23 -9
  28. data/scripts/aai_distances.bash +16 -18
  29. data/scripts/ani_distances.bash +16 -17
  30. data/scripts/assembly.bash +31 -16
  31. data/scripts/haai_distances.bash +3 -27
  32. data/scripts/miga.bash +12 -8
  33. data/scripts/p.bash +1 -1
  34. data/scripts/read_quality.bash +9 -18
  35. data/scripts/trimmed_fasta.bash +14 -30
  36. data/scripts/trimmed_reads.bash +36 -36
  37. data/test/parallel_test.rb +31 -0
  38. data/test/project_test.rb +2 -1
  39. data/test/remote_dataset_test.rb +1 -1
  40. data/utils/distance/commands.rb +1 -0
  41. data/utils/distance/database.rb +0 -1
  42. data/utils/distance/runner.rb +2 -4
  43. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  44. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  45. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  46. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  47. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  48. data/utils/enveomics/Manifest/categories.json +13 -4
  49. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  50. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  51. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  52. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  53. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  54. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  55. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  56. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  57. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  58. data/utils/enveomics/Scripts/aai.rb +3 -2
  59. data/utils/enveomics/Scripts/anir.rb +137 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  64. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  65. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  66. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  67. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  68. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  69. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  70. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  71. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  72. data/utils/enveomics/Scripts/rbm.rb +87 -133
  73. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  74. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  75. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  76. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  77. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  78. data/utils/enveomics/enveomics.R/README.md +1 -0
  79. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  80. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  81. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  82. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  83. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  84. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  85. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  86. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  88. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  89. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  90. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  100. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  101. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  102. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  103. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  104. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  105. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  106. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  107. data/utils/multitrim/README.md +67 -0
  108. data/utils/multitrim/multitrim.py +1555 -0
  109. data/utils/multitrim/multitrim.yml +13 -0
  110. data/utils/requirements.txt +4 -3
  111. data/utils/subclade/pipeline.rb +2 -2
  112. metadata +33 -4
  113. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -0,0 +1,172 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # frozen_string_literal: true
4
+
5
+ $VERSION = 0.1
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/enveomics'
8
+ require 'tmpdir'
9
+
10
+ o = {
11
+ q: false, thr: 1,
12
+ len: 0, id: 0.0, fract: 0.0, score: 0.0,
13
+ bin: '', program: :'blast+', nucl: false
14
+ }
15
+
16
+ OptionParser.new do |opts|
17
+ cmd = File.basename($0)
18
+ opts.banner = <<~BANNER
19
+
20
+ [Enveomics Collection: #{cmd} v#{$VERSION}]
21
+
22
+ [DEPRECATED: Please use rbm.rb instead]
23
+
24
+ Finds the reciprocal best matches between two sets of sequences
25
+
26
+ Usage: #{cmd} [options]
27
+
28
+ BANNER
29
+
30
+ opts.separator 'Mandatory'
31
+ opts.on(
32
+ '-1', '--seq1 FILE',
33
+ 'Path to the FastA file containing the set 1'
34
+ ) { |v| o[:seq1] = v }
35
+ opts.on(
36
+ '-2', '--seq2 FILE',
37
+ 'Path to the FastA file containing the set 2'
38
+ ) { |v| o[:seq2] = v }
39
+ opts.separator ''
40
+ opts.separator 'Search Options'
41
+ opts.on(
42
+ '-n', '--nucl',
43
+ 'Sequences are assumed to be nucleotides (proteins by default)',
44
+ 'Incompatible with -p diamond'
45
+ ) { |v| o[:nucl] = true }
46
+ opts.on(
47
+ '-l', '--len INT', Integer,
48
+ 'Minimum alignment length (in residues)',
49
+ "By default: #{o[:len]}"
50
+ ) { |v| o[:len] = v }
51
+ opts.on(
52
+ '-f', '--fract FLOAT', Float,
53
+ 'Minimum alignment length (as a fraction of the query)',
54
+ 'If set, requires BLAST+ or Diamond (see -p)',
55
+ "By default: #{o[:fract]}"
56
+ ) { |v| o[:fract] = v }
57
+ opts.on(
58
+ '-i', '--id NUM', Float,
59
+ 'Minimum alignment identity (in %)',
60
+ "By default: #{o[:id]}"
61
+ ){ |v| o[:id] = v }
62
+ opts.on(
63
+ '-s', '--score NUM', Float,
64
+ 'Minimum alignment score (in bits)',
65
+ "By default: #{o[:score]}"
66
+ ) { |v| o[:score] = v }
67
+ opts.separator ''
68
+ opts.separator 'Software Options'
69
+ opts.on(
70
+ '-b', '--bin DIR',
71
+ 'Path to the directory containing the binaries of the search program'
72
+ ) { |v| o[:bin] = v }
73
+ opts.on(
74
+ '-p', '--program STR',
75
+ 'Search program to be used. One of: blast+ (default), blast, diamond'
76
+ ) { |v| o[:program] = v.downcase.to_sym }
77
+ opts.on(
78
+ '-t', '--threads INT', Integer,
79
+ 'Number of parallel threads to be used',
80
+ "By default: #{o[:thr]}"
81
+ ) { |v| o[:thr] = v }
82
+ opts.separator ''
83
+ opts.separator 'Other Options'
84
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
85
+ opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
86
+ opts.separator ''
87
+ end.parse!
88
+
89
# Validate the parsed options; any violation aborts with a message on STDERR
abort '-1 is mandatory' if o[:seq1].nil?
abort '-2 is mandatory' if o[:seq2].nil?
if o[:program] == :diamond && o[:nucl]
  abort '-p diamond is incompatible with -n'
end
# The legacy BLAST tabular output (-m 8) has no query length column, so the
# fraction filter cannot be evaluated with -p blast
if o[:fract] > 0.0 && o[:program] == :blast
  abort 'Argument -f/--fract requires -p blast+ or -p diamond'
end
# Normalize the binaries directory so it can be prefixed directly to commands
o[:bin] = o[:bin] + '/' if o[:bin].size > 0

# Propagate -q/--quiet. The other scripts in this collection (rbm.rb,
# sam.filter.rb) set $QUIET before calling +say+, so both globals are set here.
# NOTE(review): presumably +say+ reads $QUIET -- confirm in enveomics_rb.
$quiet = o[:q]
$QUIET = o[:q]
99
+
100
+ Dir.mktmpdir do |dir|
101
+ say('Temporal directory: ', dir)
102
+
103
+ # Create databases
104
+ say 'Creating databases'
105
+ [:seq1, :seq2].each do |seq|
106
+ case o[:program]
107
+ when :blast
108
+ `"#{o[:bin]}formatdb" -i "#{o[seq]}" -n "#{dir}/#{seq}" \
109
+ -p #{o[:nucl] ? 'F' : 'T'}`
110
+ when :'blast+'
111
+ `"#{o[:bin]}makeblastdb" -in "#{o[seq]}" -out "#{dir}/#{seq}" \
112
+ -dbtype #{o[:nucl] ? 'nucl' : 'prot'}`
113
+ when :diamond
114
+ `"#{o[:bin]}diamond" makedb --in "#{o[seq]}" \
115
+ --db "#{dir}/#{seq}.dmnd" --threads "#{o[:thr]}"`
116
+ else
117
+ abort "Unsupported program: #{o[:program]}"
118
+ end
119
+ end
120
+
121
+ # Best-hits
122
+ rbh = {}
123
+ n2 = 0
124
+ say ' Running comparisons'
125
+ [2, 1].each do |i|
126
+ qry_seen = {}
127
+ q = o[:"seq#{i}"]
128
+ s = "#{dir}/seq#{i == 1 ? 2 : 1}"
129
+ say(' Query: ', q)
130
+ case o[:program]
131
+ when :blast
132
+ `"#{o[:bin]}blastall" -p #{o[:nucl] ? 'blastn' : 'blastp'} -d "#{s}" \
133
+ -i "#{q}" -v 1 -b 1 -a #{o[:thr]} -m 8 -o "#{dir}/#{i}.tab"`
134
+ when :'blast+'
135
+ `"#{o[:bin]}#{o[:nucl] ? 'blastn' : 'blastp'}" -db "#{s}" -query "#{q}" \
136
+ -max_target_seqs 1 -num_threads #{o[:thr]} -out "#{dir}/#{i}.tab" \
137
+ -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend \
138
+ sstart send evalue bitscore qlen slen"`
139
+ when :diamond
140
+ `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --db "#{s}.dmnd" \
141
+ --query "#{q}" --sensitive --daa "#{dir}/#{i}.daa" --quiet \
142
+ && "#{o[:bin]}diamond" view --daa "#{dir}/#{i}.daa" --outfmt \
143
+ 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart \
144
+ send evalue bitscore qlen slen --out "#{dir}/#{i}.tab" --quiet`
145
+ else
146
+ abort "Unsupported program: #{o[:program]}"
147
+ end
148
+
149
+ n = 0
150
+ File.open("#{dir}/#{i}.tab", 'r') do |fh|
151
+ fh.each do |ln|
152
+ ln.chomp!
153
+ row = ln.split(/\t/)
154
+ row[12] = '1' unless [:'blast+', :diamond].include? o[:program]
155
+ next unless qry_seen[row[0]].nil? &&
156
+ row[3].to_i >= o[:len] && row[2].to_f >= o[:id] &&
157
+ row[11].to_f >= o[:score] && row[3].to_f / row[12].to_i >= o[:fract]
158
+
159
+ qry_seen[row[0]] = 1
160
+ n += 1
161
+ if i == 2
162
+ rbh[row[0]] = row[1]
163
+ elsif !rbh[row[1]].nil? && rbh[row[1]] == row[0]
164
+ puts ln
165
+ n2 += 1
166
+ end
167
+ end
168
+ end
169
+ say " #{n} sequences with hit"
170
+ end
171
+ say " #{n2} RBMs"
172
+ end
@@ -1,146 +1,100 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
- # @author: Luis M. Rodriguez-R
5
- # @update: Aug-25-2015
6
- # @license: artistic license 2.0
7
- #
3
+ # frozen_string_literal: true
8
4
 
9
- require 'optparse'
5
+ $VERSION = 1.0
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/rbm'
10
8
  require 'tmpdir'
11
9
 
12
- o = {len:0, id:0, fract:0, score:0, q:false, bin:"", program:"blast+", thr:1,
13
- nucl:false}
14
- ARGV << "-h" if ARGV.size==0
10
+ bms_dummy = Enveomics::RBM.new('1', '2').bms1
11
+ o = { q: false }
12
+ %i[thr len id fract score bin program nucl].each do |k|
13
+ o[k] = bms_dummy.opt(k)
14
+ end
15
+
15
16
  OptionParser.new do |opts|
16
- opts.banner = "
17
- Finds the reciprocal best matches between two sets of sequences.
17
+ cmd = File.basename($0)
18
+ opts.banner = <<~BANNER
18
19
 
19
- Usage: #{$0} [options]"
20
- opts.separator ""
21
- opts.separator "Mandatory"
22
- opts.on("-1", "--seq1 FILE",
23
- "Path to the FastA file containing the set 1."){ |v| o[:seq1] = v }
24
- opts.on("-2", "--seq2 FILE",
25
- "Path to the FastA file containing the set 2."){ |v| o[:seq2] = v }
26
- opts.separator ""
27
- opts.separator "Search Options"
28
- opts.on("-n", "--nucl",
29
- "Sequences are assumed to be nucleotides (proteins by default)."
30
- ){ |v| o[:nucl] = true }
31
- opts.on("-l", "--len INT",
32
- "Minimum alignment length (in residues). By default: #{o[:len]}."
33
- ){ |v| o[:len] = v.to_i }
34
- opts.on("-f", "--fract FLOAT",
35
- "Minimum alignment length (as a fraction of the query).",
36
- "If set, requires BLAST+ or Diamond (see -p). By default: #{o[:fract]}."
37
- ){ |v| o[:fract] = v.to_i }
38
- opts.on("-i", "--id NUM",
39
- "Minimum alignment identity (in %). By default: #{o[:id].to_s}."
40
- ){ |v| o[:id] = v.to_f }
41
- opts.on("-s", "--score NUM",
42
- "Minimum alignment score (in bits). By default: #{o[:score]}."
43
- ){ |v| o[:score] = v.to_f }
44
- opts.separator ""
45
- opts.separator "Software Options"
46
- opts.on("-b", "--bin DIR",
47
- "Path to the directory containing the binaries of the search program."
48
- ){ |v| o[:bin] = v }
49
- opts.on("-p", "--program STR",
50
- "Search program to be used. One of: blast+ (default), blast, diamond."
51
- ){ |v| o[:program] = v }
52
- opts.on("-t", "--threads INT",
53
- "Number of parallel threads to be used. By default: #{o[:thr]}."
54
- ){ |v| o[:thr] = v.to_i }
55
- opts.separator ""
56
- opts.separator "Other Options"
57
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
58
- opts.on("-h", "--help", "Display this screen") do
59
- puts opts
60
- exit
61
- end
62
- opts.separator ""
63
- end.parse!
64
- abort "-1 is mandatory" if o[:seq1].nil?
65
- abort "-2 is mandatory" if o[:seq2].nil?
66
- abort '-p diamond is incompatible with -n' if o[:program]=='diamond' && o[:nucl]
67
- abort 'Argument -f/--fract requires -p blast+ or -p diamond' if
68
- o[:fract]>0 and o[:program]!='blast+' and o[:program]!='diamond'
69
- o[:bin] = o[:bin]+"/" if o[:bin].size > 0
20
+ [Enveomics Collection: #{cmd} v#{$VERSION}]
70
21
 
71
- Dir.mktmpdir do |dir|
72
- $stderr.puts "Temporal directory: #{dir}." unless o[:q]
22
+ Finds the reciprocal best matches between two sets of sequences
73
23
 
74
- # Create databases.
75
- $stderr.puts "Creating databases." unless o[:q]
76
- [:seq1, :seq2].each do |seq|
77
- case o[:program].downcase
78
- when 'blast'
79
- `"#{o[:bin]}formatdb" -i "#{o[seq]}" -n "#{dir}/#{seq}" \
80
- -p #{(o[:nucl]?"F":"T")}`
81
- when 'blast+'
82
- `"#{o[:bin]}makeblastdb" -in "#{o[seq]}" -out "#{dir}/#{seq}" \
83
- -dbtype #{(o[:nucl]?"nucl":"prot")}`
84
- when 'diamond'
85
- `"#{o[:bin]}diamond" makedb --in "#{dir}/#{seq}.fa" \
86
- --db "#{dir}/#{seq}.fa.dmnd" --threads "#{o[:thr]}"`
87
- else
88
- abort "Unsupported program: #{o[:program]}."
89
- end
90
- end # |seq|
24
+ Usage: #{cmd} [options]
91
25
 
92
- # Best-hits.
93
- rbh = {}
94
- n2 = 0
95
- $stderr.puts " Running comparisons." unless o[:q]
96
- [2,1].each do |i|
97
- qry_seen = {}
98
- q = o[:"seq#{i}"]
99
- s = "#{dir}/seq#{i==1?2:1}"
100
- $stderr.puts " Query: #{q}." unless o[:q]
101
- case o[:program].downcase
102
- when 'blast'
103
- `"#{o[:bin]}blastall" -p #{o[:nucl]?"blastn":"blastp"} -d "#{s}" \
104
- -i "#{q}" -v 1 -b 1 -a #{o[:thr]} -m 8 -o "#{dir}/#{i}.tab"`
105
- when 'blast+'
106
- `"#{o[:bin]}#{o[:nucl]?"blastn":"blastp"}" -db "#{s}" -query "#{q}" \
107
- -max_target_seqs 1 -num_threads #{o[:thr]} -out "#{dir}/#{i}.tab" \
108
- -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend \
109
- sstart send evalue bitscore qlen slen"`
110
- when 'diamond'
111
- `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" \
112
- --outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend \
113
- sstart send evalue bitscore qlen slen" --db "#{s}.dmnd" \
114
- --query "#{q}" --out "#{dir}/#{i}.tab" --more-sensitive`
115
- else
116
- abort "Unsupported program: #{o[:program]}."
117
- end
118
- fh = File.open("#{dir}/#{i}.tab", "r")
119
- n = 0
120
- fh.each_line do |ln|
121
- ln.chomp!
122
- row = ln.split(/\t/)
123
- row[12] = "1" unless %w[blast+ diamond].include? o[:program]
124
- if qry_seen[ row[0] ].nil? and row[3].to_i >= o[:len] and
125
- row[2].to_f >= o[:id] and row[11].to_f >= o[:score] and
126
- row[3].to_f/row[12].to_i >= o[:fract]
127
- qry_seen[ row[0] ] = 1
128
- n += 1
129
- if i==2
130
- rbh[ row[0] ] = row[1]
131
- else
132
- if !rbh[ row[1] ].nil? and rbh[ row[1] ]==row[0]
133
- puts ln
134
- n2 += 1
135
- end
136
- end
137
- end
138
- end # |ln|
139
- fh.close()
140
- $stderr.puts " #{n} sequences with hit." unless o[:q]
141
- end # |i|
142
- $stderr.puts " #{n2} RBMs." unless o[:q]
143
- end # |dir|
26
+ BANNER
27
+
28
+ opts.separator 'Mandatory'
29
+ opts.on(
30
+ '-1', '--seq1 FILE',
31
+ 'Path to the FastA file containing the set 1'
32
+ ) { |v| o[:seq1] = v }
33
+ opts.on(
34
+ '-2', '--seq2 FILE',
35
+ 'Path to the FastA file containing the set 2'
36
+ ) { |v| o[:seq2] = v }
37
+ opts.separator ''
38
+ opts.separator 'Search Options'
39
+ opts.on(
40
+ '-n', '--nucl',
41
+ 'Sequences are assumed to be nucleotides (proteins by default)',
42
+ 'Incompatible with -p diamond'
43
+ ) { |v| o[:nucl] = true }
44
+ opts.on(
45
+ '-l', '--len INT', Integer,
46
+ 'Minimum alignment length (in residues)',
47
+ "By default: #{o[:len]}"
48
+ ) { |v| o[:len] = v }
49
+ opts.on(
50
+ '-f', '--fract FLOAT', Float,
51
+ 'Minimum alignment length (as a fraction of the query)',
52
+ 'If set, requires BLAST+ or Diamond (see -p)',
53
+ "By default: #{o[:fract]}"
54
+ ) { |v| o[:fract] = v }
55
+ opts.on(
56
+ '-i', '--id NUM', Float,
57
+ 'Minimum alignment identity (in %)',
58
+ "By default: #{o[:id]}"
59
+ ){ |v| o[:id] = v }
60
+ opts.on(
61
+ '-s', '--score NUM', Float,
62
+ 'Minimum alignment score (in bits)',
63
+ "By default: #{o[:score]}"
64
+ ) { |v| o[:score] = v }
65
+ opts.separator ''
66
+ opts.separator 'Software Options'
67
+ opts.on(
68
+ '-b', '--bin DIR',
69
+ 'Path to the directory containing the binaries of the search program'
70
+ ) { |v| o[:bin] = v }
71
+ opts.on(
72
+ '-p', '--program STR',
73
+ 'Search program to be used',
74
+ 'One of: blast+ (default), blast, diamond, blat'
75
+ ) { |v| o[:program] = v.downcase.to_sym }
76
+ opts.on(
77
+ '-t', '--threads INT', Integer,
78
+ 'Number of parallel threads to be used',
79
+ "By default: #{o[:thr]}"
80
+ ) { |v| o[:thr] = v }
81
+ opts.separator ''
82
+ opts.separator 'Other Options'
83
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
84
+ opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
85
+ opts.separator ''
86
+ end.parse!
144
87
 
88
+ raise Enveomics::OptionError.new('-1 is mandatory') if o[:seq1].nil?
89
+ raise Enveomics::OptionError.new('-2 is mandatory') if o[:seq2].nil?
90
+ raise Enveomics::OptionError.new(
91
+ 'Argument -f/--fract requires -p blast+ or -p diamond'
92
+ ) if o[:fract] > 0.0 && !%i[blast+ diamond].include?(o[:program])
93
+ $QUIET = o[:q]
145
94
 
95
+ rbm = Enveomics::RBM.new(o[:seq1], o[:seq2], o)
96
+ rbm.each { |bm| puts bm.to_s }
97
+ say('Forward Best Matches: ', rbm.bms1.count)
98
+ say('Reverse Best Matches: ', rbm.bms2.count)
99
+ say('Reciprocal Best Matches: ', rbm.count)
146
100
 
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # frozen_string_literal: true
4
+
5
+ $VERSION = 1.0
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/enveomics'
8
+ use 'shellwords'
9
+
10
+ o = {
11
+ q: false, threads: 2, m_format: :sam, g_format: :fasta, identity: 95.0,
12
+ o: '-', header: true
13
+ }
14
+
15
+ OptionParser.new do |opt|
16
+ Enveomics.opt_banner(
17
+ opt, 'Filters a SAM or BAM file by target sequences and/or identity',
18
+ "#{File.basename($0)} -m map.sam -o filtered_map.sam [options]"
19
+ )
20
+
21
+ opt.separator 'Input/Output'
22
+ opt.on(
23
+ '-g', '--genome PATH',
24
+ 'Genome assembly',
25
+ 'Supports compression with .gz extension, use - for STDIN'
26
+ ) { |v| o[:g] = v }
27
+ opt.on(
28
+ '-m', '--mapping PATH',
29
+ 'Mapping file',
30
+ 'Supports compression with .gz extension, use - for STDIN'
31
+ ) { |v| o[:m] = v }
32
+ opt.on(
33
+ '-o', '--out-sam PATH',
34
+ 'Output filtered file in SAM format',
35
+ 'Supports compression with .gz extension, use - for STDOUT (default)'
36
+ ) { |v| o[:o] = v }
37
+ opt.separator ''
38
+
39
+ opt.separator 'Formats'
40
+ opt.on(
41
+ '--g-format STRING',
42
+ 'Genome assembly format: fasta (default) or list'
43
+ ) { |v| o[:g_format] = v.downcase.to_sym }
44
+ opt.on(
45
+ '--m-format STRING',
46
+ 'Mapping file format: sam (default) or bam',
47
+ 'sam supports compression with .gz file extension'
48
+ ) { |v| o[:m_format] = v.downcase.to_sym }
49
+ opt.separator ''
50
+
51
+ opt.separator 'General'
52
+ opt.on(
53
+ '-i', '--identity FLOAT', Float,
54
+ "Set a fixed threshold of percent identity (default: #{o[:identity]})"
55
+ ) { |v| o[:identity] = v }
56
+ opt.on('--no-header', 'Do not include the headers') { |v| o[:header] = v }
57
+ opt.separator ''
58
+ opt.on(
59
+ '-t', '--threads INT', Integer, "Threads to use (default: #{o[:threads]})"
60
+ ) { |v| o[:threads] = v }
61
+ opt.on('-l', '--log PATH', 'Log file to save output') { |v| o[:log] = v }
62
+ opt.on('-q', '--quiet', 'Run quietly') { |v| o[:q] = v }
63
+ opt.on('-h', '--help', 'Display this screen') do
64
+ puts opt
65
+ exit
66
+ end
67
+ opt.separator ''
68
+ end.parse!
69
+
70
+ $QUIET = o[:q]
71
+
72
+ # Functions
73
+
74
+ ##
75
+ # Parses one line +ln+ in SAM format and outputs filtered lines to +ofh+
76
+ # Filters by minimum +identity+ and +target+ sequences, and prints
77
+ # the headers if +header+
78
+ def parse_sam_line(ln, identity, target, header, ofh)
79
+ if ln =~ /^@/ || ln =~ /^\s*$/
80
+ ofh.puts ln if header
81
+ return
82
+ end
83
+
84
+ # No match
85
+ row = ln.chomp.split("\t")
86
+ return if row[2] == '*'
87
+
88
+ # Filter by target
89
+ return if !target.nil? && !target.include?(row[2])
90
+
91
+ # Exclude unless concordant or unaligned
92
+ length = row[9].size
93
+ row.shift(11) # Discard non-flag columns
94
+ flags = Hash[row.map { |i| i.sub(/:.:/, ':').split(':', 2) }]
95
+ return if flags['YT'] && !%w[CP UU].include?(flags['YT'])
96
+
97
+ # Filter by identity
98
+ unless flags['MD']
99
+ raise Enveomics::ParseError.new(
100
+ "SAM line missing MD flag:\n#{ln}\nFlags: #{flags}"
101
+ )
102
+ end
103
+ mismatches = flags['MD'].scan(/[^\d]/).count
104
+ id = 100.0 * (length - mismatches) / length
105
+ ofh.puts ln if id >= identity
106
+ end
107
+
108
# Main

require 'set'

# Validate the mapping options before any file is opened, so a bad call does
# not create or truncate the output file
# NOTE(review): -m is shown as required in the usage line; presumably
# +reader+ cannot handle nil -- confirm in enveomics_rb
raise Enveomics::OptionError.new('-m is mandatory') if o[:m].nil?
unless %i[sam bam].include?(o[:m_format])
  raise Enveomics::OptionError.new(
    "Unsupported mapping format: #{o[:m_format]}"
  )
end

# Reading targets
# +target+ stays nil (no filtering by target) unless -g is passed. It is
# stored as a Set because it is queried once per mapping record.
target = nil
if o[:g]
  say 'Loading target sequences to filter'
  gfh = reader(o[:g])
  begin
    target =
      case o[:g_format]
      when :fasta
        # Sequence IDs: first word of each defline
        gfh.each.map { |ln| $1 if ln =~ /^>(\S+)/ }.compact.to_set
      when :list
        # One sequence ID per line
        gfh.each.map(&:chomp).to_set
      else
        raise Enveomics::OptionError.new(
          "Unsupported target sequences format: #{o[:g_format]}"
        )
      end
  ensure
    gfh.close
  end
end

# Reading and filtering mapping
say 'Reading mapping file'
ofh = writer(o[:o])
begin
  case o[:m_format]
  when :sam
    mfh = reader(o[:m])
    begin
      mfh.each do |ln|
        parse_sam_line(ln, o[:identity], target, o[:header], ofh)
      end
    ensure
      mfh.close
    end
  when :bam
    # BAM is decoded into SAM text by samtools and filtered line by line
    cmd = ['samtools', 'view', o[:m], '-@', o[:threads]]
    cmd << '-h' if o[:header]
    IO.popen(cmd.shelljoin) do |fh|
      fh.each do |ln|
        parse_sam_line(ln, o[:identity], target, o[:header], ofh)
      end
    end
  end
ensure
  # Always close the output, even if a record fails to parse
  ofh.close
end
148
+