BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,314 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Trim sequence ends in the stream matching a specified primer.
30
+ #
31
+ # +trim_primer+ can trim full or partial primer sequence from sequence ends.
32
+ # This is done by matching the primer at the end specified by the +direction+
33
+ # option:
34
+ #
35
+ # Forward clip:
36
+ # sequence ATCGACTGCATCACGACG
37
+ # primer CATGAATCGA
38
+ # result CTGCATCACGACG
39
+ #
40
+ # Reverse clip:
41
+ # sequence ATCGACTGCATCACGACG
42
+ # primer GACGATAGCA
43
+ # result ATCGACTGCATCAC
44
+ #
45
+ # The primer sequence can be reverse complemented using the
46
+ # +reverse_complement+ option. Also, a minimum overlap for trimming can be
47
+ # specified using the +overlap_min+ option (default=1).
48
+ #
49
+ # Non-perfect matching can be allowed by setting the allowed
50
+ # +mismatch_percent+, +insertion_percent+ and +deletion_percent+.
51
+ #
52
+ # The following keys are added to clipped records:
53
+ #
54
+ # * TRIM_PRIMER_DIR - Direction of clip.
55
+ # * TRIM_PRIMER_POS - Sequence position of clip (0 based).
56
+ # * TRIM_PRIMER_LEN - Length of clip match.
57
+ # * TRIM_PRIMER_PAT - Clip match pattern.
58
+ # == Usage
59
+ #
60
+ # trim_primer(<primer: <string>>, <direction: <:forward|:reverse>>
61
+ # [, reverse_complement: <bool>[, overlap_min: <uint>
62
+ # [, mismatch_percent: <uint>
63
+ # [, insertion_percent: <uint>
64
+ # [, deletion_percent: <uint>]]]]])
65
+ #
66
+ # === Options
67
+ #
68
+ # * primer: <string> - Primer sequence to search for.
69
+ # * direction: <:forward|:reverse> - Clip direction.
70
+ # * reverse_complement: <bool> - Reverse complement primer (default=false)
71
+ # * overlap_min: <uint> - Minimum primer length used (default=1)
72
+ # * mismatch_percent: <uint> - Allowed percent mismatches (default=0)
73
+ # * insertion_percent: <uint> - Allowed percent insertions (default=0)
74
+ # * deletion_percent: <uint> - Allowed percent deletions (default=0)
75
+ #
76
+ # == Examples
77
+ #
78
+ # Consider the following FASTA entry in the file test.fna:
79
+ #
80
+ # >test
81
+ # ACTGACTGATGACTACGACTACGACTACTACTACGT
82
+ #
83
+ # The forward end can be trimmed like this:
84
+ #
85
+ # BP.new.
86
+ # read_fasta(input: "test.fna").
87
+ # trim_primer(primer: "ATAGAACTGAC", direction: :forward).
88
+ # dump.
89
+ # run
90
+ #
91
+ # {:SEQ_NAME=>"test",
92
+ # :SEQ=>"TGATGACTACGACTACGACTACTACTACGT",
93
+ # :SEQ_LEN=>30,
94
+ # :TRIM_PRIMER_DIR=>"FORWARD",
95
+ # :TRIM_PRIMER_POS=>0,
96
+ # :TRIM_PRIMER_LEN=>6,
97
+ # :TRIM_PRIMER_PAT=>"ACTGAC"}
98
+ #
99
+ # And trimming a reverse primer:
100
+ #
101
+ # BP.new.
102
+ # read_fasta(input: "test.fna").
103
+ # trim_primer(primer: "ACTACGTGCGGAT", direction: :reverse).
104
+ # dump.
105
+ # run
106
+ #
107
+ # {:SEQ_NAME=>"test",
108
+ # :SEQ=>"ACTGACTGATGACTACGACTACGACTACT",
109
+ # :SEQ_LEN=>29,
110
+ # :TRIM_PRIMER_DIR=>"REVERSE",
111
+ # :TRIM_PRIMER_POS=>29,
112
+ # :TRIM_PRIMER_LEN=>7,
113
+ # :TRIM_PRIMER_PAT=>"ACTACGT"}
114
+ #
115
+ # rubocop: disable ClassLength
class TrimPrimer
  STATS = %i(records_in records_out sequences_in sequences_out pattern_hits
             pattern_misses residues_in residues_out)

  # Constructor for TrimPrimer.
  #
  # Defaults are filled in first so the assertions in +check_options+ see
  # concrete values; the search pattern is derived only after the options
  # have been validated.
  #
  # @param options [Hash] Options hash (defaults are written into it).
  # @option options [String]  :primer             Primer sequence.
  # @option options [Symbol]  :direction          :forward or :reverse.
  # @option options [Integer] :overlap_min        Minimum overlap (default=1).
  # @option options [Boolean] :reverse_complement Reverse complement primer.
  # @option options [Integer] :mismatch_percent   (default=0)
  # @option options [Integer] :insertion_percent  (default=0)
  # @option options [Integer] :deletion_percent   (default=0)
  #
  # @return [TrimPrimer] Class instance.
  def initialize(options)
    @options = options
    @options[:overlap_min]       ||= 1
    @options[:mismatch_percent]  ||= 0
    @options[:insertion_percent] ||= 0
    @options[:deletion_percent]  ||= 0

    check_options

    # Full-length pattern. It is never modified after this point; each record
    # is trimmed using a local copy that is shortened step by step.
    @pattern = primer_pattern
  end

  # Return command lambda for trim_primer.
  #
  # Every record is passed on to the output; records with a non-empty :SEQ
  # are trimmed first.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        trim(record) if record[:SEQ] && record[:SEQ].length > 0

        output << record

        @status[:records_out] += 1
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :primer, :direction, :overlap_min,
                    :reverse_complement, :mismatch_percent,
                    :insertion_percent, :deletion_percent)
    options_required(@options, :primer, :direction)
    options_allowed_values(@options, direction: [:forward, :reverse])
    options_allowed_values(@options, reverse_complement: [true, false])
    options_assert(@options, ':overlap_min > 0')
    options_assert(@options, ':mismatch_percent >= 0')
    options_assert(@options, ':insertion_percent >= 0')
    options_assert(@options, ':deletion_percent >= 0')
  end

  # Determine the pattern from the primer and reverse complement if need be.
  #
  # @return [String] Pattern to search for.
  def primer_pattern
    if @options[:reverse_complement]
      BioDSL::Seq.new(seq: @options[:primer], type: :dna).
        reverse.complement.seq
    else
      @options[:primer]
    end
  end

  # Trim a sequence record in the configured direction and update the
  # status counters. Hit/miss is decided per record.
  #
  # @param record [Hash] BioDSL record
  def trim(record)
    entry = BioDSL::Seq.new_bp(record)

    @status[:sequences_in] += 1
    @status[:residues_in]  += entry.length

    trimmed = case @options[:direction]
              when :forward then trim_forward(record, entry)
              when :reverse then trim_reverse(record, entry)
              end

    if trimmed
      @status[:pattern_hits] += 1
    else
      @status[:pattern_misses] += 1
    end

    # Untrimmed sequences are emitted unchanged, so they count in full.
    @status[:sequences_out] += 1
    @status[:residues_out]  += (trimmed || entry).length
  end

  # Trim record with sequence in the forward direction. The pattern is
  # shortened from its 5' end until it matches at the sequence start or
  # becomes shorter than +overlap_min+.
  #
  # @param record [Hash] BioDSL record
  # @param entry  [BioDSL::Seq] Sequence entry built from the record.
  #
  # @return [BioDSL::Seq,nil] Trimmed entry or nil if nothing matched.
  def trim_forward(record, entry)
    pattern = @pattern

    while pattern.length >= @options[:overlap_min]
      match = match_forward(entry, pattern)

      return merge_forward(record, entry, match) if match

      pattern = pattern[1..-1]
    end

    nil
  end

  # Search the start of a given entry and return match data.
  #
  # @param entry   [BioDSL::Seq] Sequence entry.
  # @param pattern [String] Current (possibly shortened) pattern.
  #
  # @return [BioDSL::Seq::Match,nil] Match result.
  def match_forward(entry, pattern)
    match_opt = match_options(pattern.length)
    match_opt[:start] = 0
    match_opt[:stop]  = 0

    entry.patmatch(pattern, match_opt)
  end

  # Use given match data to extract the subsequence downstream of the match
  # and merge it, together with the TRIM_PRIMER_* keys, into the record.
  #
  # @param record [Hash] BioDSL record
  # @param entry  [BioDSL::Seq] Sequence entry.
  # @param match  [BioDSL::Seq::Match] Match data.
  #
  # @return [BioDSL::Seq] Trimmed entry.
  def merge_forward(record, entry, match)
    trimmed = entry[match.pos + match.length..-1]

    record.merge!(trimmed.to_bp)
    record[:TRIM_PRIMER_DIR] = 'FORWARD'
    record[:TRIM_PRIMER_POS] = match.pos
    record[:TRIM_PRIMER_LEN] = match.length
    record[:TRIM_PRIMER_PAT] = match.match

    trimmed
  end

  # Trim record with sequence in the reverse direction. The pattern is
  # shortened from its 3' end until it matches at the sequence end or
  # becomes shorter than +overlap_min+.
  #
  # @param record [Hash] BioDSL record
  # @param entry  [BioDSL::Seq] Sequence entry built from the record.
  #
  # @return [BioDSL::Seq,nil] Trimmed entry or nil if nothing matched.
  def trim_reverse(record, entry)
    pattern = @pattern

    while pattern.length >= @options[:overlap_min]
      match = match_reverse(entry, pattern)

      return merge_reverse(record, entry, match) if match

      pattern = pattern[0...-1]
    end

    nil
  end

  # Search the end of a given entry and return match data.
  #
  # @param entry   [BioDSL::Seq] Sequence entry.
  # @param pattern [String] Current (possibly shortened) pattern.
  #
  # @return [BioDSL::Seq::Match,nil] Match result.
  def match_reverse(entry, pattern)
    match_opt = match_options(pattern.length)

    # Start searching one pattern length from the end, clamped to the
    # sequence start when the pattern is longer than the sequence.
    match_opt[:start] = [entry.length - pattern.length, 0].max

    entry.patmatch(pattern, match_opt)
  end

  # Use given match data to extract the subsequence upstream of the match
  # and merge it, together with the TRIM_PRIMER_* keys, into the record.
  #
  # @param record [Hash] BioDSL record
  # @param entry  [BioDSL::Seq] Sequence entry.
  # @param match  [BioDSL::Seq::Match] Match data.
  #
  # @return [BioDSL::Seq] Trimmed entry.
  def merge_reverse(record, entry, match)
    trimmed = entry[0...match.pos]

    record.merge!(trimmed.to_bp)
    record[:TRIM_PRIMER_DIR] = 'REVERSE'
    record[:TRIM_PRIMER_POS] = match.pos
    record[:TRIM_PRIMER_LEN] = match.length
    record[:TRIM_PRIMER_PAT] = match.match

    trimmed
  end

  # Calculate from the given pattern length the absolute mismatches,
  # insertions and deletions allowed and return a hash with these values.
  #
  # @param length [Integer] Pattern length.
  #
  # @return [Hash] Match options hash.
  def match_options(length)
    mis = (length * @options[:mismatch_percent] * 0.01).round
    ins = (length * @options[:insertion_percent] * 0.01).round
    del = (length * @options[:deletion_percent] * 0.01).round

    {max_mismatches: mis,
     max_insertions: ins,
     max_deletions: del}
  end
end
314
+ end
@@ -0,0 +1,192 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Trim sequence ends removing residues with a low quality score.
30
+ #
31
+ # +trim_seq+ removes subquality residues from the ends of sequences in the
32
+ # stream based on quality SCORES in a FASTQ type quality score string.
33
+ # Trimming progresses until a stretch, specified with the +length_min+
34
+ # option, is found thus preventing premature termination of the trimming
35
+ # by e.g. a single good quality residue at the end. It is possible, using
36
+ # the +mode+ option to indicate if the sequence should be trimmed from the
37
+ # left or right end or both (default=:both).
38
+ #
39
+ # == Usage
40
+ #
41
+ # trim_seq([quality_min: <uint>[, length_min: <uint>
42
+ # [, mode: <:left|:right|:both>]]])
43
+ #
44
+ # === Options
45
+ #
46
+ # * quality_min: <uint> - Minimum quality (default=20).
47
+ # * length_min: <uint> - Minimum stretch length (default=3).
48
+ # * mode: <string> - Trim mode :left|:right|:both (default=:both).
49
+ #
50
+ # == Examples
51
+ #
52
+ # Consider the following FASTQ entry in the file test.fq:
53
+ #
54
+ # @test
55
+ # gatcgatcgtacgagcagcatctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcat
56
+ # +
57
+ # @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJI
58
+ #
59
+ # To trim both ends simply do:
60
+ #
61
+ # BP.new.read_fastq(input: "test.fq").trim_seq.trim_seq.run
62
+ #
63
+ # SEQ_NAME: test
64
+ # SEQ: tctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcat
65
+ # SEQ_LEN: 53
66
+ # SCORES: TUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJI
67
+ # ---
68
+ #
69
+ # Use the +quality_min+ option to change the minimum value to discard:
70
+ #
71
+ # BP.new.
72
+ # read_fastq(input: "test.fq").
73
+ # trim_seq(quality_min: 25).
74
+ # trim_seq.
75
+ # run
76
+ #
77
+ # SEQ_NAME: test
78
+ # SEQ: cgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
79
+ # SEQ_LEN: 57
80
+ # SCORES: YZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
81
+ # ---
82
+ #
83
+ # To trim the left end only (use :rigth for right end only), do:
84
+ #
85
+ # BP.new.read_fastq(input: "test.fq").trim_seq(mode: :left).trim_seq.run
86
+ #
87
+ # SEQ_NAME: test
88
+ # SEQ: tctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
89
+ # SEQ_LEN: 62
90
+ # SCORES: TUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
91
+ # ---
92
+ #
93
+ # To increase the length of stretch of good quality residues to match, use
94
+ # the +length_min+ option:
95
+ #
96
+ # BP.new.read_fastq(input: "test.fq").trim_seq(length_min: 4).trim_seq.run
97
+ #
98
+ # SEQ_NAME: test
99
+ # SEQ: tctgacgtatcgatcgttgattagttgctagctatgcagtct
100
+ # SEQ_LEN: 42
101
+ # SCORES: TUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUT
102
+ # ---
class TrimSeq
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for the TrimSeq class.
  #
  # The options are validated first and the defaults filled in afterwards.
  #
  # @param [Hash] options Options hash (defaults are written into it).
  #
  # @option options [Integer] :quality_min
  #   TrimSeq minimum quality (default=20).
  #
  # @option options [Symbol] :mode
  #   TrimSeq mode (default=:both).
  #
  # @option options [Integer] :length_min
  #   TrimSeq stretch length triggering trim (default=3).
  #
  # @return [TrimSeq] Returns an instance of the TrimSeq class.
  def initialize(options)
    @options = options

    check_options
    defaults

    @min  = @options[:quality_min]
    @len  = @options[:length_min]
    @mode = @options[:mode].to_sym
  end

  # Return a lambda for the trim_seq command.
  #
  # Every record is passed on to the output; records holding both :SEQ and
  # :SCORES are trimmed first.
  #
  # @return [Proc] Returns the trim_seq command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        if record[:SEQ] && record[:SCORES]
          trim_seq(record)
        end

        output << record

        @status[:records_out] += 1
      end
    end
  end

  private

  # Check the options.
  def check_options
    options_allowed(@options, :quality_min, :length_min, :mode)
    options_allowed_values(@options, mode: [:left, :right, :both])
    options_assert(@options, ':quality_min >= 0')
    options_assert(@options, ':quality_min <= 40')
    options_assert(@options, ':length_min > 0')
  end

  # Set default options for any option not given.
  def defaults
    @options[:quality_min] ||= 20
    @options[:mode]        ||= :both
    @options[:length_min]  ||= 3
  end

  # Trim sequence in a given record with sequence info and merge the result
  # back into the record.
  #
  # @param record [Hash] BioDSL record
  def trim_seq(record)
    seq_entry = BioDSL::Seq.new_bp(record)

    @status[:sequences_in] += 1
    @status[:residues_in]  += seq_entry.length

    # An unrecognised mode leaves the entry untouched.
    if @mode == :both
      seq_entry.quality_trim!(@min, @len)
    elsif @mode == :left
      seq_entry.quality_trim_left!(@min, @len)
    elsif @mode == :right
      seq_entry.quality_trim_right!(@min, @len)
    end

    @status[:sequences_out] += 1
    @status[:residues_out]  += seq_entry.length

    record.merge!(seq_entry.to_bp)
  end
end
192
+ end
@@ -0,0 +1,170 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Run uchime_ref on sequences in the stream.
30
+ #
31
+ # This is a wrapper for the +usearch+ tool to run the program uchime_ref.
32
+ # Basically sequence type records are searched against a reference database of
33
+ # non-chimeric sequences, and chimeric sequences are filtered out so only
34
+ # non-chimeric sequences are output.
35
+ #
36
+ # Please refer to the manual:
37
+ #
38
+ # http://drive5.com/usearch/manual/uchime_ref.html
39
+ #
40
+ # Usearch 7.0 must be installed for +usearch+ to work. Read more here:
41
+ #
42
+ # http://www.drive5.com/usearch/
43
+ #
44
+ # == Usage
45
+ #
46
+ # uchime_ref(<database: <file>>[, cpus: <uint>])
47
+ #
48
+ # === Options
49
+ #
50
+ # * database: <file> - Database to search (in FASTA format).
51
+ # * cpus: <uint> - Number of CPU cores to use (default=1).
52
+ #
53
+ # == Examples
54
+ #
class UchimeRef
  require 'BioDSL/helpers/aux_helper'

  include AuxHelper

  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for UchimeRef.
  #
  # @param options [Hash] Options hash (defaults are written into it).
  # @option options [String]  :database Path to reference database.
  # @option options [Integer] :cpus     Number of CPU cores (default=1).
  #
  # @return [UchimeRef] Class instance.
  def initialize(options)
    @options = options
    aux_exist('usearch')
    check_options
    @options[:cpus]   ||= 1
    @options[:strand] ||= 'plus' # This option cant be changed in usearch7.0
  end

  # Return command lambda for uchime_ref.
  #
  # Sequence records are written to a temporary FASTA file and run through
  # uchime_ref, and the resulting entries are emitted. Records without :SEQ
  # are passed straight through.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      TmpDir.create('input', 'output') do |tmp_in, tmp_out|
        process_input(input, output, tmp_in)

        # With no sequences in the stream usearch reports an empty input
        # file and there is no result to parse, so skip reading the output.
        process_output(output, tmp_out) if run_uchime_ref(tmp_in, tmp_out)
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :database, :cpus)
    options_required(@options, :database)
    options_files_exist(@options, :database)
    options_assert(@options, ':cpus >= 1')
    options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
  end

  # Process input stream and save records with sequences to a temporary FASTA
  # file or emit non-sequence containing records to the output stream.
  #
  # Sequences lacking :SEQ_NAME are named after their index in the stream.
  #
  # @param input  [Enumerator] Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  # @param tmp_in [String] Path to temporary FASTA file.
  def process_input(input, output, tmp_in)
    BioDSL::Fasta.open(tmp_in, 'w') do |ios|
      input.each_with_index do |record, i|
        @status[:records_in] += 1

        if record[:SEQ]
          @status[:sequences_in] += 1
          @status[:residues_in]  += record[:SEQ].length
          seq_name = record[:SEQ_NAME] || i.to_s

          entry = BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])

          ios.puts entry.to_fasta
        else
          output << record
          @status[:records_out] += 1
        end
      end
    end
  end

  # Run uchime_ref on input file and save result to output file.
  #
  # @param tmp_in  [String] Path to input file.
  # @param tmp_out [String] Path to output file.
  #
  # @raise [BioDSL::UsearchError] If command fails for any reason other
  #   than an empty input file.
  #
  # @return [Boolean] true if uchime_ref ran, false if it was skipped
  #   because the input file was empty.
  def run_uchime_ref(tmp_in, tmp_out)
    uchime_opts = {
      input:    tmp_in,
      output:   tmp_out,
      database: @options[:database],
      strand:   @options[:strand],
      cpus:     @options[:cpus],
      verbose:  @options[:verbose]
    }

    BioDSL::Usearch.uchime_ref(uchime_opts)

    true
  rescue BioDSL::UsearchError => e
    raise unless e.message =~ /Empty input file/

    false
  end

  # Process uchime_ref output data and emit to output stream.
  #
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_out [String] Path to file with uchime_ref data.
  def process_output(output, tmp_out)
    BioDSL::Fasta.open(tmp_out) do |ios|
      ios.each do |entry|
        record = entry.to_bp

        output << record
        @status[:sequences_out] += 1
        @status[:residues_out]  += entry.length
        @status[:records_out]   += 1
      end
    end
  end
end
170
+ end