BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,318 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Clip sequences in the stream at a specified primer location.
30
+ #
31
+ # +clip_primer+ locates a specified +primer+ in sequences in the stream and
32
+ # clips the sequence after the match if the +direction+ is forward or before
33
+ # the match if the +direction+ is reverse. Using the +reverse_complement+
34
+ # option the primer sequence will be reverse complemented prior to matching.
35
+ # Using the +search_distance+ option will limit the primer search to the
36
+ # beginning of the sequence if the +direction+ is forward and to the end if
37
+ # the direction is +reverse+.
38
+ #
39
+ # Non-perfect matching can be allowed by setting the allowed
40
+ # +mismatch_percent+, +insertion_percent+ and +deletion_percent+.
41
+ #
42
+ # The following keys are added to clipped records:
43
+ #
44
+ # * CLIP_PRIMER_DIR - Direction of clip.
45
+ # * CLIP_PRIMER_POS - Sequence position of clip (0 based).
46
+ # * CLIP_PRIMER_LEN - Length of clip match.
47
+ # * CLIP_PRIMER_PAT - Clip match pattern.
48
+ # == Usage
49
+ #
50
+ # clip_primer(<primer: <string>>, <direction: <:forward|:reverse>
51
+ # [, reverse_complement: <bool>[, search_distance: <uint>
52
+ # [, mismatch_percent: <uint>
53
+ # [, insertion_percent: <uint>
54
+ # [, deletion_percent: <uint>]]]]])
55
+ #
56
+ # === Options
57
+ #
58
+ # * primer: <string> - Primer sequence to search for.
59
+ # * direction: <:forward|:reverse> - Clip direction.
60
+ # * reverse_complement: <bool> -
61
+ # Reverse complement primer (default=false).
62
+ # * search_distance: <uint> -
63
+ # Search distance from forward or reverse end.
64
+ # * mismatch_percent: <uint> - Allowed percent mismatches (default=0).
65
+ # * insertion_percent: <uint> - Allowed percent insertions (default=0).
66
+ # * deletion_percent: <uint> - Allowed percent deletions (default=0).
67
+ #
68
+ # == Examples
69
+ #
70
+ # Consider the following FASTA entry in the file test.fna:
71
+ #
72
+ # >test
73
+ # actgactgaTCGTATGCCGTCTTCTGCTTactacgt
74
+ #
75
+ # To clip this sequence in the forward direction with the primer
76
+ # 'TCGTATGCCGTCTTCTGCTT' do:
77
+ #
78
+ # BP.new.
79
+ # read_fasta(input: "test.fna").
80
+ # clip_primer(primer: "TCGTATGCCGTCTTCTGCTT", direction: :forward).
81
+ # dump.
82
+ # run
83
+ #
84
+ # {:SEQ_NAME=>"test",
85
+ # :SEQ=>"actacgt",
86
+ # :SEQ_LEN=>7,
87
+ # :CLIP_PRIMER_DIR=>"FORWARD",
88
+ # :CLIP_PRIMER_POS=>9,
89
+ # :CLIP_PRIMER_LEN=>20,
90
+ # :CLIP_PRIMER_PAT=>"TCGTATGCCGTCTTCTGCTT"}
91
+ #
92
+ # Or in the reverse direction:
93
+ #
94
+ # BP.new.
95
+ # read_fasta(input: "test.fna").
96
+ # clip_primer(primer: "TCGTATGCCGTCTTCTGCTT", direction: :reverse).
97
+ # dump.
98
+ # run
99
+ #
100
+ # {:SEQ_NAME=>"test",
101
+ # :SEQ=>"actgactga",
102
+ # :SEQ_LEN=>9,
103
+ # :CLIP_PRIMER_DIR=>"REVERSE",
104
+ # :CLIP_PRIMER_POS=>9,
105
+ # :CLIP_PRIMER_LEN=>20,
106
+ # :CLIP_PRIMER_PAT=>"TCGTATGCCGTCTTCTGCTT"}
107
+ # rubocop:disable ClassLength
108
+ class ClipPrimer
109
+ STATS = %i(records_in records_out sequences_in sequences_out
110
+ residues_in residues_out pattern_hits pattern_misses)
111
+
112
+ # Constructor for ClipPrimer.
113
+ #
114
+ # @param options [Hash] Options hash.
115
+ # @option options [String] :primer Primer used for matching.
116
+ # @option options [Symbol] :direction Direction for clipping.
117
+ # @option options [Integer] :search_distance Search distance.
118
+ # @option options [Boolean] :reverse_complement
119
+ # Flag indicating that primer should be reverse complemented.
120
+ # The :mismatch_percent, :insertion_percent and :deletion_percent options default to 0.
121
+ # @return [ClipPrimer] Returns ClipPrimer instance.
122
+ def initialize(options)
123
+ @options = options
124
+ defaults # fill in the percent defaults before validating
125
+ check_options
126
+
127
+ @primer = primer # resolved first: the calc_* helpers read @primer.length
128
+ @mis = calc_mis
129
+ @ins = calc_ins
130
+ @del = calc_del
131
+ end
132
+
133
+ # Lambda for ClipPrimer command.
134
+ # Every record is passed on to output; only records with a non-empty :SEQ are clipped.
135
+ # @return [Proc] Lambda for command.
136
+ def lmb
137
+ lambda do |input, output, status|
138
+ status_init(status, STATS) # presumably sets up the @status counters named in STATS - helper not shown here
139
+
140
+ input.each do |record|
141
+ @status[:records_in] += 1
142
+
143
+ clip_primer(record) if record[:SEQ] && record[:SEQ].length > 0
144
+
145
+ output << record
146
+ @status[:records_out] += 1
147
+ end
148
+ end
149
+ end
150
+
151
+ private
152
+
153
+ # Check @options with the options_* helpers (defined outside this file): allowed and required keys, allowed values and value ranges.
154
+ def check_options
155
+ options_allowed(@options, :primer, :direction, :search_distance,
156
+ :reverse_complement, :mismatch_percent,
157
+ :insertion_percent, :deletion_percent)
158
+ options_required(@options, :primer, :direction)
159
+ options_allowed_values(@options, direction: [:forward, :reverse])
160
+ options_allowed_values(@options, reverse_complement: [true, false])
161
+ options_assert(@options, ':search_distance > 0')
162
+ options_assert(@options, ':mismatch_percent >= 0')
163
+ options_assert(@options, ':insertion_percent >= 0')
164
+ options_assert(@options, ':deletion_percent >= 0')
165
+ end
166
+
167
+ # Set the three percent options to 0 unless a value was given.
168
+ def defaults
169
+ @options[:mismatch_percent] ||= 0
170
+ @options[:insertion_percent] ||= 0
171
+ @options[:deletion_percent] ||= 0
172
+ end
173
+
174
+ # Convert the mismatch percentage into a number of mismatches for the primer.
175
+ #
176
+ # @return [Integer] Primer length * mismatch_percent / 100, rounded.
177
+ def calc_mis
178
+ (@primer.length * @options[:mismatch_percent] * 0.01).round
179
+ end
180
+
181
+ # Convert the insertion percentage into a number of insertions for the primer.
182
+ #
183
+ # @return [Integer] Primer length * insertion_percent / 100, rounded.
184
+ def calc_ins
185
+ (@primer.length * @options[:insertion_percent] * 0.01).round
186
+ end
187
+
188
+ # Convert the deletion percentage into a number of deletions for the primer.
189
+ #
190
+ # @return [Integer] Primer length * deletion_percent / 100, rounded.
191
+ def calc_del
192
+ (@primer.length * @options[:deletion_percent] * 0.01).round
193
+ end
194
+
195
+ # Remove the four CLIP_PRIMER_* keys from the record so earlier results do not linger.
196
+ #
197
+ # @param record [Hash] BioDSL record to reset (modified in place).
198
+ def reset(record)
199
+ record.delete :CLIP_PRIMER_DIR
200
+ record.delete :CLIP_PRIMER_POS
201
+ record.delete :CLIP_PRIMER_LEN
202
+ record.delete :CLIP_PRIMER_PAT
203
+ end
204
+
205
+ def clip_primer(record)
206
+ reset(record)
207
+ entry = BioDSL::Seq.new_bp(record)
208
+
209
+ @status[:sequences_in] += 1
210
+ @status[:residues_in] += entry.length
211
+
212
+ case @options[:direction]
213
+ when :forward then clip_primer_forward(record, entry)
214
+ when :reverse then clip_primer_reverse(record, entry)
215
+ else
216
+ fail RunTimeError, 'This should never happen'
217
+ end
218
+
219
+ @status[:sequences_out] += 1
220
+ @status[:residues_out] += entry.length
221
+ end
222
+
223
+ # Clip forward primer from entry and save clip information
224
+ # in record.
225
+ # The search starts at position 0; the stop bound comes from #stop.
226
+ # @param record [Hash] BioDSL record with sequence.
227
+ # @param entry [BioDSL::Seq] Sequence entry.
228
+ def clip_primer_forward(record, entry)
229
+ if (match = entry.patmatch(@primer, start: 0, stop: stop(entry),
230
+ max_mismatches: @mis,
231
+ max_insertions: @ins,
232
+ max_deletions: @del))
233
+ @status[:pattern_hits] += 1 # counted even when the guard below skips the clip
234
+
235
+ if match.pos + match.length <= entry.length
236
+ entry = entry[match.pos + match.length..-1] # keep only what follows the match
237
+
238
+ merge_record_entry(record, entry, match, 'FORWARD')
239
+ end
240
+ else
241
+ @status[:pattern_misses] += 1
242
+ end
243
+ end
244
+
245
+ # Calculate the stop value passed to patmatch for forward clipping: search distance minus primer length, never below 0.
246
+ #
247
+ # @param entry [BioDSL::Seq] Sequence entry.
248
+ #
249
+ # @return [Integer] Match stop position.
250
+ def stop(entry)
251
+ stop = search_distance(entry) - @primer.length
252
+ stop = 0 if stop < 0 # primer longer than the search distance
253
+ stop
254
+ end
255
+
256
+ # Clip reverse primer from entry and save clip information
257
+ # in record.
258
+ # The search covers the last search_distance(entry) positions; the match and everything after it are removed.
259
+ # @param record [Hash] BioDSL record with sequence.
260
+ # @param entry [BioDSL::Seq] Sequence entry.
261
+ def clip_primer_reverse(record, entry)
262
+ start = entry.length - search_distance(entry)
263
+
264
+ if (match = entry.patmatch(@primer, start: start,
265
+ stop: entry.length - 1,
266
+ max_mismatches: @mis,
267
+ max_insertions: @ins,
268
+ max_deletions: @del))
269
+ @status[:pattern_hits] += 1
270
+
271
+ entry = entry[0...match.pos] # keep only what precedes the match
272
+
273
+ merge_record_entry(record, entry, match, 'REVERSE')
274
+ else
275
+ @status[:pattern_misses] += 1
276
+ end
277
+ end
278
+
279
+ # Merge the clipped entry into the record and add the CLIP_PRIMER_* keys.
280
+ #
281
+ # @param record [Hash] BioDSL record (modified in place).
282
+ # @param entry [BioDSL::Seq] Sequence entry.
283
+ # @param match [BioDSL::Match] Match object.
284
+ # @param type [String] Clip direction label; callers pass 'FORWARD' or 'REVERSE'.
285
+ def merge_record_entry(record, entry, match, type)
286
+ record.merge!(entry.to_bp)
287
+ record[:CLIP_PRIMER_DIR] = type
288
+ record[:CLIP_PRIMER_POS] = match.pos
289
+ record[:CLIP_PRIMER_LEN] = match.length
290
+ record[:CLIP_PRIMER_PAT] = match.match
291
+ end
292
+
293
+ # Return the primer sequence, reverse-complemented when the :reverse_complement option is set.
294
+ #
295
+ # @return [String] Primer sequence.
296
+ def primer
297
+ if @options[:reverse_complement]
298
+ Seq.new(seq: @options[:primer], type: :dna).reverse.complement.seq
299
+ else
300
+ @options[:primer]
301
+ end
302
+ end
303
+
304
+ # Determine the search distance from the search_distance in the options or
305
+ # as the sequence length.
306
+ #
307
+ # @param entry [BioDSL::Seq] Sequence entry.
308
+ #
309
+ # @return [Integer] The :search_distance option if set and shorter than the entry, else the entry length.
310
+ def search_distance(entry)
311
+ if @options[:search_distance] && @options[:search_distance] < entry.length
312
+ @options[:search_distance]
313
+ else
314
+ entry.length
315
+ end
316
+ end
317
+ end
318
+ end
@@ -0,0 +1,181 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Create OTUs from sequences in the stream.
30
+ #
31
+ # Use the +usearch+ program cluster_otus to cluster sequences in the stream
32
+ # and output a representative sequence from each cluster. Sequences must
33
+ # be dereplicated and sorted according to +SEQ_COUNT+ in decreasing order.
34
+ #
35
+ # Please refer to the manual:
36
+ #
37
+ # http://drive5.com/usearch/manual/cluster_otus.html
38
+ #
39
+ # Usearch 7.0 must be installed for +usearch+ to work. Read more here:
40
+ #
41
+ # http://www.drive5.com/usearch/
42
+ #
43
+ # == Usage
44
+ #
45
+ # cluster_otus([identity: <float>])
46
+ #
47
+ # === Options
48
+ #
49
+ # * identity: <float> - OTU cluster identity between 0.0 and 1.0
50
+ # (Default 0.97).
51
+ #
52
+ # == Examples
53
+ #
54
+ # To create OTU clusters do:
55
+ #
56
+ # BP.new.
57
+ # read_fasta(input: "in.fna").
58
+ # dereplicate_seq.
59
+ # sort(key: :SEQ_COUNT, reverse: true).
60
+ # cluster_otus.
61
+ # run
62
+ class ClusterOtus
63
+ require 'BioDSL/helpers/aux_helper'
64
+
65
+ include AuxHelper
66
+
67
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
68
+ residues_out)
69
+
70
+ # Constructor for ClusterOtus.
71
+ #
72
+ # @param options [Hash] Options hash.
73
+ # @option options [Float] :identity Cluster identity.
74
+ #
75
+ # @return [ClusterOtus] Instance of ClusterOtus.
76
+ def initialize(options)
77
+ @options = options
78
+
79
+ aux_exist('usearch') # presumably checks that the usearch executable is available - confirm in AuxHelper
80
+ check_options
81
+ defaults # applied after validation, so check_options only sees user-supplied values
82
+ end
83
+
84
+ def lmb # Returns the lambda that runs cluster_otus over the record stream.
85
+ lambda do |input, output, status|
86
+ status_init(status, STATS)
87
+
88
+ TmpDir.create('tmp.fa', 'tmp.uc') do |tmp_in, tmp_out|
89
+ process_input(input, output, tmp_in) # write sequences to tmp_in; pass other records on
90
+
91
+ BioDSL::Usearch.cluster_otus(input: tmp_in, output: tmp_out,
92
+ identity: @options[:identity],
93
+ verbose: @options[:verbose]) # NOTE(review): check_options only allows :identity - confirm where :verbose is set
94
+
95
+ process_output(output, tmp_out) # emit the entries read back from tmp_out
96
+ end
97
+ end
98
+ end
99
+
100
+ private
101
+
102
+ # Check options: only :identity is allowed and it is asserted to lie between 0.0 and 1.0.
103
+ def check_options
104
+ options_allowed(@options, :identity)
105
+ options_assert(@options, ':identity >= 0.0')
106
+ options_assert(@options, ':identity <= 1.0')
107
+ end
108
+
109
+ # Set default options: :identity becomes 0.97 unless given.
110
+ def defaults
111
+ @options[:identity] ||= 0.97
112
+ end
113
+
114
+ # Process input records and save sequence data to a temporary FASTA file for
115
+ # use with +usearch cluster_otus+.
116
+ # Records without a :SEQ key are passed directly to the output stream.
117
+ # @param input [Enumerator] Input stream.
118
+ # @param output [Enumerator::Yielder] Output stream.
119
+ # @param tmp_in [String] Path to temporary FASTA file.
120
+ def process_input(input, output, tmp_in)
121
+ BioDSL::Fasta.open(tmp_in, 'w') do |ios|
122
+ input.each_with_index do |record, i|
123
+ @status[:records_in] += 1
124
+
125
+ if record.key? :SEQ
126
+ @status[:sequences_in] += 1
127
+ @status[:residues_in] += record[:SEQ].length
128
+ ios.puts record2entry(record, i).to_fasta # sequence records are not forwarded here
129
+ else
130
+ output << record
131
+ @status[:records_out] += 1
132
+ end
133
+ end
134
+ end
135
+ end
136
+
137
+ # Create a Sequence entry from a record using the record index as sequence
138
+ # name if no such is found.
139
+ #
140
+ # @param record [Hash] BioDSL record.
141
+ # @param i [Integer] Record index
142
+ def record2entry(record, i)
143
+ seq_name = record[:SEQ_NAME] || i.to_s
144
+
145
+ if record.key? :SEQ_COUNT
146
+ seq_name << ";size=#{record[:SEQ_COUNT]}"
147
+ else
148
+ fail BioDSL::SeqError, 'Missing SEQ_COUNT'
149
+ end
150
+
151
+ BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])
152
+ end
153
+
154
+ # Process the cluster output and emit otus to the output stream.
155
+ # The ";size=<n>" suffix of each name is moved into :SEQ_COUNT.
156
+ # @param output [Enumerator::Yielder] Output stream.
157
+ # @param tmp_out [String] Path to temporary OTU file.
158
+ #
159
+ # @raise [UsearchError] if size info is missing from SEQ_NAME.
160
+ def process_output(output, tmp_out)
161
+ BioDSL::Fasta.open(tmp_out) do |ios|
162
+ ios.each do |entry|
163
+ record = entry.to_bp
164
+
165
+ if record[:SEQ_NAME] =~ /;size=(\d+)$/
166
+ record[:SEQ_COUNT] = Regexp.last_match(1).to_i
167
+ record[:SEQ_NAME].sub!(/;size=\d+$/, '') # strip the size suffix from the name
168
+ else
169
+ fail BioDSL::UsearchError, 'Missing size in SEQ_NAME: ' \
170
+ "#{record[:SEQ_NAME]}"
171
+ end
172
+
173
+ output << record
174
+ @status[:sequences_out] += 1
175
+ @status[:residues_out] += record[:SEQ].length
176
+ @status[:records_out] += 1
177
+ end
178
+ end
179
+ end
180
+ end
181
+ end
@@ -0,0 +1,170 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Collapse OTUs based on identical taxonomy strings.
30
+ #
31
+ # +collapse_otus+ collapses OTUs in OTU style records if the TAXONOMY string
32
+ # is redundant. At the same time the sample counts (+_COUNT+) are summed for
33
+ # the collapsed OTUs.
34
+ #
35
+ # == Usage
36
+ #
37
+ # collapse_otus
38
+ #
39
+ # === Options
40
+ #
41
+ # == Examples
42
+ #
43
+ # Here is an OTU table with four rows, one of which has a redundant Taxonomy
44
+ # string:
45
+ #
46
+ # BP.new.read_table(input: "otu_table.txt").dump.run
47
+ #
48
+ # {:OTU=>"OTU_1",
49
+ # :CM1_COUNT=>881,
50
+ # :CM10_COUNT=>234,
51
+ # :TAXONOMY=>
52
+ # "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
53
+ # Leuconostocaceae(100);Leuconostoc(100)"}
54
+ # {:OTU=>"OTU_0",
55
+ # :CM1_COUNT=>3352,
56
+ # :CM10_COUNT=>4329,
57
+ # :TAXONOMY=>
58
+ # "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
59
+ # Streptococcaceae(100);Lactococcus(100)"}
60
+ # {:OTU=>"OTU_5",
61
+ # :CM1_COUNT=>5,
62
+ # :CM10_COUNT=>0,
63
+ # :TAXONOMY=>
64
+ # "Bacteria(100);Proteobacteria(100);Gammaproteobacteria(100); \
65
+ # Pseudomonadales(100);Pseudomonadaceae(100);Pseudomonas(100)"}
66
+ # {:OTU=>"OTU_3",
67
+ # :CM1_COUNT=>228,
68
+ # :CM10_COUNT=>200,
69
+ # :TAXONOMY=>
70
+ # "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
71
+ # Streptococcaceae(100);Lactococcus(100)"}
72
+ #
73
+ # In order to collapse the redundant OTU simply run the stream through
74
+ # +collapse_otus+:
75
+ #
76
+ # BP.new.read_table(input: "otu_table.txt").collapse_otus.dump.run
77
+ #
78
+ # {:OTU=>"OTU_1",
79
+ # :CM1_COUNT=>881,
80
+ # :CM10_COUNT=>234,
81
+ # :TAXONOMY=>
82
+ # "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
83
+ # Leuconostocaceae(100);Leuconostoc(100)"}
84
+ # {:OTU=>"OTU_0",
85
+ # :CM1_COUNT=>3580,
86
+ # :CM10_COUNT=>4529,
87
+ # :TAXONOMY=>
88
+ # "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
89
+ # Streptococcaceae(100);Lactococcus(100)"}
90
+ # {:OTU=>"OTU_5",
91
+ # :CM1_COUNT=>5,
92
+ # :CM10_COUNT=>0,
93
+ # :TAXONOMY=>
94
+ # "Bacteria(100);Proteobacteria(100);Gammaproteobacteria(100); \
95
+ # Pseudomonadales(100);Pseudomonadaceae(100);Pseudomonas(100)"}
96
+ class CollapseOtus
97
+ STATS = %i(records_in records_out otus_in otus_out)
98
+
99
+ # Constructor for CollapseOtus.
100
+ #
101
+ # @param options [Hash] Options Hash, stored in @options and passed to check_options.
102
+ def initialize(options)
103
+ @options = options
104
+
105
+ check_options
106
+ end
107
+
108
+ # Return the CollapseOtus command lambda.
109
+ # Records without :TAXONOMY pass straight through; OTU records are collapsed and emitted once the input is exhausted.
110
+ # @return [Proc] Lambda for the command.
111
+ def lmb
112
+ lambda do |input, output, status|
113
+ status_init(status, STATS)
114
+
115
+ hash = {} # collapsed OTU records, keyed by taxonomy (see collapse_tax)
116
+
117
+ input.each do |record|
118
+ @status[:records_in] += 1
119
+
120
+ if record[:TAXONOMY]
121
+ @status[:otus_in] += 1
122
+
123
+ collapse_tax(hash, record)
124
+ else
125
+ output << record
126
+ @status[:records_out] += 1
127
+ end
128
+ end
129
+
130
+ write_tax(hash, output)
131
+ end
132
+ end
133
+
134
+ private
135
+
136
+ # Check options; nil is passed as the only allowed key (presumably meaning no options are accepted - confirm in options_allowed).
137
+ def check_options
138
+ options_allowed(@options, nil)
139
+ end
140
+
141
+ # Collapse identical taxonomies by removing duplicates and adding their
142
+ # counts.
143
+ #
144
+ # @param hash [Hash] Hash with taxonomy records.
145
+ # @param record [Hash] BioDSL record with taxonomy info.
146
+ def collapse_tax(hash, record)
147
+ key = record[:TAXONOMY].gsub(/\(\d+\)/, '').to_sym
148
+
149
+ if hash.key? key
150
+ record.each do |k, v|
151
+ hash[key][k] += v if k[-6..-1] == '_COUNT'
152
+ end
153
+ else
154
+ hash[key] = record
155
+ end
156
+ end
157
+
158
+ # Output collapsed taxonomy records and count each as both an OTU and a record out.
159
+ #
160
+ # @param hash [Hash] Hash with taxonomy records.
161
+ # @param output [Enumerator::Yielder] Output stream.
162
+ def write_tax(hash, output)
163
+ hash.each_value do |record|
164
+ output << record
165
+ @status[:otus_out] += 1
166
+ @status[:records_out] += 1
167
+ end
168
+ end
168
+ end
169
+ end
170
+ end