BioDSL 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,314 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Trim sequence ends in the stream matching a specified primer.
30
+ #
31
+ # +trim_primer+ can trim full or partial primer sequence from sequence ends.
32
+ # This is done by matching the primer at the end specified by the +direction+
33
+ # option:
34
+ #
35
+ # Forward clip:
36
+ # sequence ATCGACTGCATCACGACG
37
+ # primer CATGAATCGA
38
+ # result CTGCATCACGACG
39
+ #
40
+ # Reverse clip:
41
+ # sequence ATCGACTGCATCACGACG
42
+ # primer GACGATAGCA
43
+ # result ATCGACTGCATCAC
44
+ #
45
+ # The primer sequence can be reverse complemented using the
46
+ # +reverse_complement+ option. Also, a minimum overlap for trimming can be
47
+ # specified using the +overlap_min+ option (default=1).
48
+ #
49
+ # Non-perfect matching can be allowed by setting the allowed
50
+ # +mismatch_percent+, +insertion_percent+ and +deletion_percent+.
51
+ #
52
+ # The following keys are added to clipped records:
53
+ #
54
+ # * TRIM_PRIMER_DIR - Direction of clip.
55
+ # * TRIM_PRIMER_POS - Sequence position of clip (0 based).
56
+ # * TRIM_PRIMER_LEN - Length of clip match.
57
+ # * TRIM_PRIMER_PAT - Clip match pattern.
58
+ # == Usage
59
+ #
60
+ # trim_primer(<primer: <string>>, <direction: <:forward|:reverse>>
61
+ # [, reverse_complement: <bool>[, overlap_min: <uint>
62
+ # [, mismatch_percent: <uint>
63
+ # [, insertion_percent: <uint>
64
+ # [, deletion_percent: <uint>]]]]])
65
+ #
66
+ # === Options
67
+ #
68
+ # * primer: <string> - Primer sequence to search for.
69
+ # * direction: <:forward|:reverse> - Clip direction.
70
+ # * reverse_complement: <bool> - Reverse complement primer (default=false)
71
+ # * overlap_min: <uint> - Minimum primer length used (default=1)
72
+ # * mismatch_percent: <uint> - Allowed percent mismatches (default=0)
73
+ # * insertion_percent: <uint> - Allowed percent insertions (default=0)
74
+ # * deletion_percent: <uint> - Allowed percent deletions (default=0)
75
+ #
76
+ # == Examples
77
+ #
78
+ # Consider the following FASTA entry in the file test.fna:
79
+ #
80
+ # >test
81
+ # ACTGACTGATGACTACGACTACGACTACTACTACGT
82
+ #
83
+ # The forward end can be trimmed like this:
84
+ #
85
+ # BP.new.
86
+ # read_fasta(input: "test.fna").
87
+ # trim_primer(primer: "ATAGAACTGAC", direction: :forward).
88
+ # dump.
89
+ # run
90
+ #
91
+ # {:SEQ_NAME=>"test",
92
+ # :SEQ=>"TGATGACTACGACTACGACTACTACTACGT",
93
+ # :SEQ_LEN=>30,
94
+ # :TRIM_PRIMER_DIR=>"FORWARD",
95
+ # :TRIM_PRIMER_POS=>0,
96
+ # :TRIM_PRIMER_LEN=>6,
97
+ # :TRIM_PRIMER_PAT=>"ACTGAC"}
98
+ #
99
+ # And trimming a reverse primer:
100
+ #
101
+ # BP.new.
102
+ # read_fasta(input: "test.fna").
103
+ # trim_primer(primer: "ACTACGTGCGGAT", direction: :reverse).
104
+ # dump.
105
+ # run
106
+ #
107
+ # {:SEQ_NAME=>"test",
108
+ # :SEQ=>"ACTGACTGATGACTACGACTACGACTACT",
109
+ # :SEQ_LEN=>29,
110
+ # :TRIM_PRIMER_DIR=>"REVERSE",
111
+ # :TRIM_PRIMER_POS=>29,
112
+ # :TRIM_PRIMER_LEN=>7,
113
+ # :TRIM_PRIMER_PAT=>"ACTACGT"}
114
+ #
115
+ # rubocop: disable ClassLength
116
class TrimPrimer
  STATS = %i(records_in records_out sequences_in sequences_out pattern_hits
             pattern_misses residues_in residues_out)

  # Constructor for TrimPrimer.
  #
  # Fills in defaults for the optional numeric options, resolves the primer
  # pattern (reverse complemented if requested) and validates the options.
  #
  # @param options [Hash] Options hash.
  # @option options [String]  :primer             Primer sequence.
  # @option options [Symbol]  :direction          :forward or :reverse.
  # @option options [Integer] :overlap_min        Minimum primer length used
  #                                               (default=1).
  # @option options [Boolean] :reverse_complement Reverse complement primer.
  # @option options [Integer] :mismatch_percent   (default=0).
  # @option options [Integer] :insertion_percent  (default=0).
  # @option options [Integer] :deletion_percent   (default=0).
  #
  # @return [TrimPrimer] Class instance.
  def initialize(options)
    @options = options
    @options[:overlap_min]       ||= 1
    @options[:mismatch_percent]  ||= 0
    @options[:insertion_percent] ||= 0
    @options[:deletion_percent]  ||= 0
    # The full-length pattern. It is never modified after this point; the
    # trim methods work on a per-record local copy.
    @pattern = pattern

    check_options
  end

  # Return command lambda for trim_primer.
  #
  # Every record is passed on to the output; records with a non-empty :SEQ
  # are trimmed first according to the +direction+ option.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        trim_record(record) if record[:SEQ] && record[:SEQ].length > 0

        output << record

        @status[:records_out] += 1
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :primer, :direction, :overlap_min,
                    :reverse_complement, :mismatch_percent,
                    :insertion_percent, :deletion_percent)
    options_required(@options, :primer, :direction)
    options_allowed_values(@options, direction: [:forward, :reverse])
    options_allowed_values(@options, reverse_complement: [true, false])
    options_assert(@options, ':overlap_min > 0')
    options_assert(@options, ':mismatch_percent >= 0')
    options_assert(@options, ':insertion_percent >= 0')
    options_assert(@options, ':deletion_percent >= 0')
  end

  # Determine the pattern from the primer and reverse complement if need be.
  #
  # @return [String] Pattern to search for.
  def pattern
    if @options[:reverse_complement]
      Seq.new(seq: @options[:primer], type: :dna).reverse.complement.seq
    else
      @options[:primer]
    end
  end

  # Trim a sequence record in the configured direction and update the
  # sequence/residue counters.
  #
  # @param record [Hash] BioDSL record with a non-empty :SEQ.
  def trim_record(record)
    @status[:sequences_in] += 1

    case @options[:direction]
    when :forward then trim_forward(record)
    when :reverse then trim_reverse(record)
    end

    # record[:SEQ] is read after trimming so that a merged-in trimmed
    # sequence is what gets counted.
    @status[:sequences_out] += 1
    @status[:residues_out]  += record[:SEQ].length
  end

  # Trim record with sequence in the forward direction.
  #
  # The primer is tried at full length first and then shortened from its
  # 5' end one residue at a time until it matches or becomes shorter than
  # +overlap_min+. The shortening is done on a local copy so each record
  # starts from the full-length primer.
  #
  # @param record [Hash] BioDSL record
  def trim_forward(record)
    entry = BioDSL::Seq.new_bp(record)

    @status[:residues_in] += entry.length

    pattern = @pattern
    hit     = false

    while pattern.length >= @options[:overlap_min]
      if (match = match_forward(entry, pattern))
        merge_forward(record, entry, match)
        hit = true
        break
      end

      pattern = pattern[1..-1]
    end

    @status[hit ? :pattern_hits : :pattern_misses] += 1
  end

  # Search the start of a given entry and return match data.
  #
  # @param entry   [BioDSL::Seq] Sequence entry.
  # @param pattern [String]      Pattern to match (defaults to full primer).
  #
  # @return [BioDSL::Seq::Match,nil] Match result.
  def match_forward(entry, pattern = @pattern)
    match_opt = match_options(pattern.length)
    match_opt[:start] = 0
    match_opt[:stop]  = 0

    entry.patmatch(pattern, match_opt)
  end

  # Use given match data to extract the subsequence after the match from the
  # given entry and merge it into the given record.
  #
  # @param record [Hash] BioDSL record
  # @param entry [BioDSL::Seq] Sequence entry.
  # @param match [BioDSL::Seq::Match] Match data.
  def merge_forward(record, entry, match)
    entry = entry[match.pos + match.length..-1]

    record.merge!(entry.to_bp)
    record[:TRIM_PRIMER_DIR] = 'FORWARD'
    record[:TRIM_PRIMER_POS] = match.pos
    record[:TRIM_PRIMER_LEN] = match.length
    record[:TRIM_PRIMER_PAT] = match.match
  end

  # Trim record with sequence in the reverse direction.
  #
  # The primer is tried at full length first and then shortened from its
  # 3' end one residue at a time until it matches or becomes shorter than
  # +overlap_min+. The shortening is done on a local copy so each record
  # starts from the full-length primer.
  #
  # @param record [Hash] BioDSL record
  def trim_reverse(record)
    entry = BioDSL::Seq.new_bp(record)

    @status[:residues_in] += entry.length

    pattern = @pattern
    hit     = false

    while pattern.length >= @options[:overlap_min]
      if (match = match_reverse(entry, pattern))
        merge_reverse(record, entry, match)
        hit = true
        break
      end

      pattern = pattern[0...-1]
    end

    @status[hit ? :pattern_hits : :pattern_misses] += 1
  end

  # Search the end of a given entry and return match data.
  #
  # @param entry   [BioDSL::Seq] Sequence entry.
  # @param pattern [String]      Pattern to match (defaults to full primer).
  #
  # @return [BioDSL::Seq::Match,nil] Match result.
  def match_reverse(entry, pattern = @pattern)
    match_opt = match_options(pattern.length)

    # Start the search one pattern length from the end, clamped to 0 when
    # the pattern is longer than the sequence.
    match_opt[:start] = [entry.length - pattern.length, 0].max

    entry.patmatch(pattern, match_opt)
  end

  # Use given match data to extract the subsequence before the match from
  # the given entry and merge it into the given record.
  #
  # @param record [Hash] BioDSL record
  # @param entry [BioDSL::Seq] Sequence entry.
  # @param match [BioDSL::Seq::Match] Match data.
  def merge_reverse(record, entry, match)
    entry = entry[0...match.pos]

    record.merge!(entry.to_bp)
    record[:TRIM_PRIMER_DIR] = 'REVERSE'
    record[:TRIM_PRIMER_POS] = match.pos
    record[:TRIM_PRIMER_LEN] = match.length
    record[:TRIM_PRIMER_PAT] = match.match
  end

  # Calculate from the given pattern length the absolute mismatches,
  # insertions and deletions allowed and return a hash with these values.
  #
  # @param length [Integer] Pattern length.
  #
  # @return [Hash] Match options hash.
  def match_options(length)
    mis = (length * @options[:mismatch_percent] * 0.01).round
    ins = (length * @options[:insertion_percent] * 0.01).round
    del = (length * @options[:deletion_percent] * 0.01).round

    {max_mismatches: mis,
     max_insertions: ins,
     max_deletions: del}
  end
end
314
+ end
@@ -0,0 +1,192 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Trim sequence ends removing residues with a low quality score.
30
+ #
31
+ # +trim_seq+ removes subquality residues from the ends of sequences in the
32
+ # stream based on quality SCORES in a FASTQ type quality score string.
33
+ # Trimming progresses until a stretch, specified with the +length_min+
34
+ # option, is found thus preventing premature termination of the trimming
35
+ # by e.g. a single good quality residue at the end. It is possible, using
36
+ # the +mode+ option to indicate if the sequence should be trimmed from the
37
+ # left or right end or both (default=:both).
38
+ #
39
+ # == Usage
40
+ #
41
+ # trim_seq([quality_min: <uint>[, length_min: <uint>
42
+ # [, mode: <:left|:right|:both>]]])
43
+ #
44
+ # === Options
45
+ #
46
+ # * quality_min: <uint> - Minimum quality (default=20).
47
+ # * length_min: <uint> - Minimum stretch length (default=3).
48
+ # * mode: <string> - Trim mode :left|:right|:both (default=:both).
49
+ #
50
+ # == Examples
51
+ #
52
+ # Consider the following FASTQ entry in the file test.fq:
53
+ #
54
+ # @test
55
+ # gatcgatcgtacgagcagcatctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcat
56
+ # +
57
+ # @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJI
58
+ #
59
+ # To trim both ends simply do:
60
+ #
61
+ # BP.new.read_fastq(input: "test.fq").trim_seq.trim_seq.run
62
+ #
63
+ # SEQ_NAME: test
64
+ # SEQ: tctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcat
65
+ # SEQ_LEN: 62
66
+ # SCORES: TUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJI
67
+ # ---
68
+ #
69
+ # Use the +quality_min+ option to change the minimum value to discard:
70
+ #
71
+ # BP.new.
72
+ # read_fastq(input: "test.fq").
73
+ # trim_seq(quality_min: 25).
74
+ # trim_seq.
75
+ # run
76
+ #
77
+ # SEQ_NAME: test
78
+ # SEQ: cgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
79
+ # SEQ_LEN: 57
80
+ # SCORES: YZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
81
+ # ---
82
+ #
83
+ # To trim the left end only (use :right for right end only), do:
84
+ #
85
+ # BP.new.read_fastq(input: "test.fq").trim_seq(mode: :left).trim_seq.run
86
+ #
87
+ # SEQ_NAME: test
88
+ # SEQ: tctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
89
+ # SEQ_LEN: 62
90
+ # SCORES: TUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
91
+ # ---
92
+ #
93
+ # To increase the length of stretch of good quality residues to match, use
94
+ # the +length_min+ option:
95
+ #
96
+ # BP.new.read_fastq(input: "test.fq").trim_seq(length_min: 4).trim_seq.run
97
+ #
98
+ # SEQ_NAME: test
99
+ # SEQ: tctgacgtatcgatcgttgattagttgctagctatgcagtct
100
+ # SEQ_LEN: 42
101
+ # SCORES: TUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUT
102
+ # ---
103
class TrimSeq
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for the TrimSeq class.
  #
  # Validates the given options, applies defaults for the ones left out and
  # caches mode, minimum quality and stretch length for use while trimming.
  #
  # @param [Hash] options Options hash.
  #
  # @option options [Integer] :quality_min
  #   TrimSeq minimum quality (default=20).
  #
  # @option options [Symbol] :mode
  #   TrimSeq mode (default=:both).
  #
  # @option options [Integer] :length_min
  #   TrimSeq stretch length triggering trim (default=3).
  #
  # @return [TrimSeq] Returns an instance of the TrimSeq class.
  def initialize(options)
    @options = options

    check_options
    defaults

    mode, @min, @len = @options.values_at(:mode, :quality_min, :length_min)
    @mode = mode.to_sym
  end

  # Return a lambda for the trim_seq command.
  #
  # Records holding both :SEQ and :SCORES are trimmed; all records are
  # emitted to the output.
  #
  # @return [Proc] Returns the trim_seq command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        has_scored_seq = record[:SEQ] && record[:SCORES]
        trim_seq(record) if has_scored_seq

        output << record

        @status[:records_out] += 1
      end
    end
  end

  private

  # Check the options.
  def check_options
    options_allowed(@options, :quality_min, :length_min, :mode)
    options_allowed_values(@options, mode: [:left, :right, :both])

    [':quality_min >= 0',
     ':quality_min <= 40',
     ':length_min > 0'].each do |assertion|
      options_assert(@options, assertion)
    end
  end

  # Set default options for any option not given.
  def defaults
    {quality_min: 20, mode: :both, length_min: 3}.each do |key, value|
      @options[key] ||= value
    end
  end

  # Trim sequence in a given record with sequence info and merge the
  # trimmed entry back into the record.
  #
  # @param record [Hash] BioDSL record
  def trim_seq(record)
    entry = BioDSL::Seq.new_bp(record)

    @status[:sequences_in] += 1
    @status[:residues_in]  += entry.length

    if @mode == :both
      entry.quality_trim!(@min, @len)
    elsif @mode == :left
      entry.quality_trim_left!(@min, @len)
    elsif @mode == :right
      entry.quality_trim_right!(@min, @len)
    end

    @status[:sequences_out] += 1
    @status[:residues_out]  += entry.length

    record.merge!(entry.to_bp)
  end
end
192
+ end
@@ -0,0 +1,170 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Run uchime_ref on sequences in the stream.
30
+ #
31
+ # This is a wrapper for the +usearch+ tool to run the program uchime_ref.
32
+ # Basically sequence type records are searched against a reference database of
32
+ # non-chimeric sequences, and chimeric sequences are filtered out so only
34
+ # non-chimeric sequences are output.
35
+ #
36
+ # Please refer to the manual:
37
+ #
38
+ # http://drive5.com/usearch/manual/uchime_ref.html
39
+ #
40
+ # Usearch 7.0 must be installed for +usearch+ to work. Read more here:
41
+ #
42
+ # http://www.drive5.com/usearch/
43
+ #
44
+ # == Usage
45
+ #
46
+ # uchime_ref(<database: <file>>[, cpus: <uint>])
47
+ #
48
+ # === Options
49
+ #
50
+ # * database: <file> - Database to search (in FASTA format).
51
+ # * cpus: <uint> - Number of CPU cores to use (default=1).
52
+ #
53
+ # == Examples
54
+ #
55
class UchimeRef
  require 'BioDSL/helpers/aux_helper'

  include AuxHelper

  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for UchimeRef.
  #
  # Verifies that the usearch executable is available, validates the given
  # options and applies defaults.
  #
  # @param options [Hash] Options hash.
  # @option options [String] :database
  # @option options [Integer] :cpus
  #
  # @return [UchimeRef] Class instance.
  def initialize(options)
    @options = options

    aux_exist('usearch')
    check_options

    # The strand option cant be changed in usearch7.0
    {cpus: 1, strand: 'plus'}.each do |key, value|
      @options[key] ||= value
    end
  end

  # Return command lambda for uchime_ref.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      TmpDir.create('input', 'output') do |fasta_in, fasta_out|
        process_input(input, output, fasta_in)
        run_uchime_ref(fasta_in, fasta_out)

        process_output(output, fasta_out)
      end
    end
  end

  private

  # Check options.
  def check_options
    cores_max = BioDSL::Config::CORES_MAX

    options_allowed(@options, :database, :cpus)
    options_required(@options, :database)
    options_files_exist(@options, :database)
    options_assert(@options, ':cpus >= 1')
    options_assert(@options, ":cpus <= #{cores_max}")
  end

  # Process input stream and save records with sequences to a temporary FASTA
  # file or emit non-sequence containing records to the output stream.
  #
  # Sequences without a :SEQ_NAME are named after their index in the stream.
  #
  # @param input  [Enumerator]          Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  # @param tmp_in [String]              Path to temporary FASTA file.
  def process_input(input, output, tmp_in)
    BioDSL::Fasta.open(tmp_in, 'w') do |ios|
      input.each_with_index do |record, index|
        @status[:records_in] += 1

        seq = record[:SEQ]

        unless seq
          output << record
          @status[:records_out] += 1
          next
        end

        @status[:sequences_in] += 1
        @status[:residues_in]  += seq.length

        name  = record[:SEQ_NAME] || index.to_s
        entry = BioDSL::Seq.new(seq_name: name, seq: seq)

        ios.puts entry.to_fasta
      end
    end
  end

  # Run uchime_ref on the input file and save the result to the output file.
  #
  # An error whose message contains "Empty input file" is ignored; any other
  # usearch error is re-raised.
  #
  # @param tmp_in  [String] Path to input file.
  # @param tmp_out [String] Path to output file.
  #
  # @raise [BioDSL::UsearchError] If command fails.
  def run_uchime_ref(tmp_in, tmp_out)
    uchime_opts = {input: tmp_in, output: tmp_out}

    [:database, :strand, :cpus, :verbose].each do |key|
      uchime_opts[key] = @options[key]
    end

    BioDSL::Usearch.uchime_ref(uchime_opts)
  rescue BioDSL::UsearchError => error
    raise if error.message !~ /Empty input file/
  end

  # Process uchime_ref output data and emit to output stream.
  #
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_out [String]              Path to file with uchime_ref data.
  def process_output(output, tmp_out)
    Fasta.open(tmp_out) do |ios|
      ios.each do |entry|
        output << entry.to_bp

        @status[:sequences_out] += 1
        @status[:residues_out]  += entry.length
        @status[:records_out]   += 1
      end
    end
  end
end
170
+ end