BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,336 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Assemble ordered overlapping pair-end sequences in the stream.
30
+ #
31
+ # +assemble_pairs+ assembles overlapping pair-end sequences into single
32
+ # sequences that are output to the stream - the original sequences are not
33
+ # output. Assembly works by progressively considering all overlaps between the
34
+ # maximum considered overlap using the +overlap_max+ option (default is the
35
+ # length of the shortest sequence) until the minimum required overlap supplied
36
+ # with the +overlap_min+ option (default 1). For each overlap a percentage of
37
+ # mismatches can be allowed using the +mismatch_percent+ option (default 20%).
38
+ #
39
+ # Mismatches in the overlapping regions are resolved so that the residue with
40
+ # the highest quality score is used in the assembled sequence. The quality
41
+ # scores are averaged in the overlapping region. The sequence of the
42
+ # overlapping region is output in upper case and the remaining in lower case.
43
+ #
44
+ # Furthermore, sequences must be in interleaved order in the stream - use
45
+ # +read_fastq+ with +input+ and +input2+ options for that.
46
+ #
47
+ # The additional keys are added to records with assembled sequences:
48
+ #
49
+ # * OVERLAP_LEN - the length of the located overlap.
50
+ # * HAMMING_DIST - the number of mismatches in the assembly.
51
+ #
52
+ # Using the +merge_unassembled+ option will merge any unassembled sequences
53
+ # taking into account reverse complementation of read2 if the
54
+ # +reverse_complement+ option is true. Note that you probably want to set
55
+ # +overlap_min+ to 1 before using +merge_unassembled+ to improve chances of
56
+ # making an assembly before falling back to a simple merge.
57
+ #
58
+ # == Usage
59
+ #
60
+ # assemble_pairs([mismatch_percent: <uint>[, overlap_min: <uint>
61
+ # [, overlap_max: <uint>[, reverse_complement: <bool>
62
+ # [, merge_unassembled: <bool>]]]]])
63
+ #
64
+ # === Options
65
+ #
66
+ # * mismatch_percent: <uint> - Maximum allowed overlap mismatches in
67
+ # percent (default=20).
68
+ # * overlap_min: <uint> - Minimum overlap required (default=1).
69
+ # * overlap_max: <uint> - Maximum overlap considered
70
+ # (default=<length of shortest sequences>).
71
+ # * reverse_complement: <bool> - Reverse-complement read2 before assembly
72
+ # (default=false).
73
+ # * merge_unassembled: <bool> - Merge unassembled pairs (default=false).
74
+ #
75
+ # == Examples
76
+ #
77
+ # If you have two pair-end sequence files with the Illumina data then you
78
+ # can assemble these using assemble_pairs like this:
79
+ #
80
+ # BP.new.
81
+ # read_fastq(input: "file1.fq", input2: "file2.fq").
82
+ # assemble_pairs(reverse_complement: true).
83
+ # run
84
+ # rubocop:disable ClassLength
85
+ class AssemblePairs
86
+ STATS = %i(overlap_sum hamming_sum records_in records_out sequences_in
87
+ sequences_out residues_in residues_out assembled unassembled)
88
+
89
+ # Constructor for the AssemblePairs class.
90
+ #
91
+ # @param [Hash] options Options hash.
92
+ #
93
+ # @option options [Integer] :mismatch_percent
94
+ # Maximum allowed overlap mismatches in percent.
95
+ #
96
+ # @option options [Integer] :overlap_min
97
+ # Minimum length of overlap.
98
+ #
99
+ # @option options [Integer] :overlap_max
100
+ # Maximum length of overlap.
101
+ #
102
+ # @option options [Boolean] :reverse_complement
103
+ # Reverse-complement read2.
104
+ #
105
+ # @option options [Boolean] :merge_unassembled
106
+ # Merge read pairs that couldn't be assembled.
107
+ #
108
+ # @option options [Boolean] :allow_unassembled
109
+ # Output reads that couldn't be assembled.
110
+ #
111
+ # @return [AssemblePairs] Returns an instance of the class.
112
+ def initialize(options)
113
+ @options = options
114
+
115
+ @overlap_sum = 0
116
+ @hamming_sum = 0
117
+
118
+ check_options
119
+ defaults
120
+ end
121
+
122
# Return a lambda for the assemble_pairs command.
#
# Records are read from the input two at a time. When both records of a
# slice hold a :SEQ key the pair is handed to assemble_pairs; otherwise the
# records are forwarded through output_record.
#
# @return [Proc] Returns the assemble_pairs command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    input.each_slice(2) do |record1, record2|
      # A stream with an odd number of records ends in a slice holding a
      # single record, so count only the records that were actually read.
      @status[:records_in] += record2 ? 2 : 1

      if record2 && record1[:SEQ] && record2[:SEQ]
        assemble_pairs(record1, record2, output)
      else
        output_record(record1, output)
        output_record(record2, output) if record2
      end
    end

    calc_status
  end
end
143
+
144
+ private
145
+
146
+ # Check the options.
147
+ def check_options
148
+ options_allowed(@options, :mismatch_percent, :overlap_min, :overlap_max,
149
+ :reverse_complement, :merge_unassembled,
150
+ :allow_unassembled)
151
+ options_allowed_values(@options, reverse_complement: [true, false, nil])
152
+ options_allowed_values(@options, merge_unassembled: [true, false, nil])
153
+ options_allowed_values(@options, allow_unassembled: [true, false, nil])
154
+ options_conflict(@options, allow_unassembled: :merge_unassembled)
155
+ options_assert(@options, ':mismatch_percent >= 0')
156
+ options_assert(@options, ':mismatch_percent <= 100')
157
+ options_assert(@options, ':overlap_min > 0')
158
+ end
159
+
160
+ # Set default options.
161
+ def defaults
162
+ @options[:mismatch_percent] ||= 20
163
+ @options[:overlap_min] ||= 1
164
+ end
165
+
166
+ # Output a record to the stream if a stream is provided.
167
+ #
168
+ # @param record [Hash] BioDSL record to output.
169
+ # @param output [Enumerator::Yielder, nil] Output stream or nil.
170
+ def output_record(record, output)
171
+ return unless output
172
+ output << record
173
+ @status[:records_out] += 1
174
+ end
175
+
176
+ # Assemble records with sequences and output to the stream
177
+ #
178
+ # @param record1 [Hash] BioDSL record1.
179
+ # @param record2 [Hash] BioDSL record2.
180
+ # @param output [Enumerator::Yielder] Output stream.
181
+ def assemble_pairs(record1, record2, output)
182
+ entry1, entry2 = records2entries(record1, record2)
183
+
184
+ if overlap_possible?(entry1, entry2, @options[:overlap_min]) &&
185
+ assembled = assemble_entries(entry1, entry2)
186
+ output_assembled(assembled, output)
187
+ elsif @options[:merge_unassembled]
188
+ output_merged(entry1, entry2, output)
189
+ elsif @options[:allow_unassembled]
190
+ output_entries(entry1, entry2, output)
191
+ else
192
+ @status[:unassembled] += 1
193
+ end
194
+ end
195
+
196
+ # Given a pair of records convert these into sequence entries and
197
+ # reverse-complement if need be.
198
+ #
199
+ # @param record1 [Hash] Record1.
200
+ # @param record2 [Hash] Record2.
201
+ #
202
+ # @return [Array] Returns a tuple of sequence entries.
203
+ def records2entries(record1, record2)
204
+ entry1 = BioDSL::Seq.new_bp(record1)
205
+ entry2 = BioDSL::Seq.new_bp(record2)
206
+ entry1.type = :dna
207
+ entry2.type = :dna
208
+
209
+ if @options[:reverse_complement] && entry2.length > 0
210
+ entry2.reverse!.complement!
211
+ end
212
+
213
+ @status[:sequences_in] += 2
214
+ @status[:residues_in] += entry1.length + entry2.length
215
+
216
+ [entry1, entry2]
217
+ end
218
+
219
+ # Determines if an overlap between two given entries is possible considering
220
+ # the minimum overlap length.
221
+ #
222
+ # @param entry1 [BioDSL::Seq] Sequence entry1.
223
+ # @param entry2 [BioDSL::Seq] Sequence entry2.
224
+ # @param overlap_min [Integer] Minimum overlap.
225
+ #
226
+ # @return [Boolean] True if overlap possible otherwise false.
227
+ def overlap_possible?(entry1, entry2, overlap_min)
228
+ entry1.length >= overlap_min && entry2.length >= overlap_min
229
+ end
230
+
231
+ # Assemble a pair of given entries if possible and return an assembled
232
+ # entry, or nil if the entries could not be assembled.
233
+ #
234
+ # @param entry1 [BioDSL::Seq] Sequence entry1.
235
+ # @param entry2 [BioDSL::Seq] Sequence entry2.
236
+ #
237
+ # @return [BioDSL::Seq, nil] Returns Seq entry or nil.
238
+ def assemble_entries(entry1, entry2)
239
+ BioDSL::Assemble.pair(
240
+ entry1,
241
+ entry2,
242
+ mismatches_max: @options[:mismatch_percent],
243
+ overlap_min: @options[:overlap_min],
244
+ overlap_max: @options[:overlap_max]
245
+ )
246
+ end
247
+
248
+ # Output assembled pairs to the output stream.
249
+ #
250
+ # @param assembled [BioDSL::Seq] Assembled sequence entry.
251
+ # @param output [Enumerator::Yielder] Output stream.
252
+ def output_assembled(assembled, output)
253
+ output << assembled2record(assembled)
254
+
255
+ @status[:assembled] += 1
256
+ @status[:records_out] += 1
257
+ @status[:sequences_out] += 1
258
+ @status[:residues_out] += assembled.length
259
+ end
260
+
261
+ # Convert a sequence entry to a BioPiece record with hamming distance and
262
+ # overlap length from the entry's seq_name.
263
+ #
264
+ # @param assembled [BioDSL::Seq] Merged sequence entry.
265
+ #
266
+ # @return [Hash] BioDSL record.
267
+ def assembled2record(assembled)
268
+ new_record = assembled.to_bp
269
+
270
+ if assembled.seq_name =~ /overlap=(\d+):hamming=(\d+)$/
271
+ overlap = Regexp.last_match(1).to_i
272
+ hamming = Regexp.last_match(2).to_i
273
+ @overlap_sum += overlap
274
+ @hamming_sum += hamming
275
+ new_record[:OVERLAP_LEN] = overlap
276
+ new_record[:HAMMING_DIST] = hamming
277
+ end
278
+
279
+ new_record
280
+ end
281
+
282
+ # Merge and output entries to the stream.
283
+ #
284
+ # @param entry1 [BioDSL::Seq] Entry1.
285
+ # @param entry2 [BioDSL::Seq] Entry2.
286
+ # @param output [Enumerator::Yielder] Output stream.
287
+ def output_merged(entry1, entry2, output)
288
+ entry1 << entry2
289
+
290
+ output << entry2record(entry1)
291
+
292
+ @status[:unassembled] += 1
293
+ @status[:sequences_out] += 1
294
+ @status[:residues_out] += entry1.length
295
+ @status[:records_out] += 1
296
+ end
297
+
298
+ # Output unassembled entries to the stream.
299
+ #
300
+ # @param entry1 [BioDSL::Seq] Entry1.
301
+ # @param entry2 [BioDSL::Seq] Entry2.
302
+ # @param output [Enumerator::Yielder] Output stream.
303
+ def output_entries(entry1, entry2, output)
304
+ output << entry2record(entry1)
305
+ output << entry2record(entry2)
306
+
307
+ @status[:unassembled] += 2
308
+ @status[:sequences_out] += 2
309
+ @status[:residues_out] += entry1.length + entry2.length
310
+ @status[:records_out] += 2
311
+ end
312
+
313
+ # Converts a sequence entry to a BioPiece record.
314
+ #
315
+ # @param entry [BioDSL::Seq] Sequence entry.
316
+ #
317
+ # @return [Hash] BioDSL record.
318
+ def entry2record(entry)
319
+ record = entry.to_bp
320
+ record[:OVERLAP_LEN] = 0
321
+ record[:HAMMING_DIST] = entry.length
322
+ record
323
+ end
324
+
325
# Calculate additional status values.
#
# Adds :assembled_percent, :overlap_mean and :hamming_dist_mean to @status,
# each rounded to two decimals. A value whose denominator is zero (e.g. an
# empty stream) is reported as 0.0.
def calc_status
  # Float division by zero yields NaN or Infinity instead of raising, so
  # the denominators are checked explicitly.
  ratio = lambda do |numerator, denominator|
    denominator.zero? ? 0.0 : (numerator.to_f / denominator).round(2)
  end

  @status[:assembled_percent] =
    ratio.call(100 * 2 * @status[:assembled], @status[:sequences_in])
  @status[:overlap_mean] = ratio.call(@overlap_sum, @status[:records_out])
  @status[:hamming_dist_mean] =
    ratio.call(@hamming_sum, @status[:records_out])
end
335
+ end
336
+ end
@@ -0,0 +1,230 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # rubocop:disable ClassLength
30
+
31
+ # == Assemble sequences in the stream using IDBA_UD.
32
+ #
33
+ # +assemble_seq_idba+ is a wrapper around the prokaryotic metagenome
34
+ # assembler IDBA_UD:
35
+ #
36
+ # http://i.cs.hku.hk/~alse/hkubrg/projects/idba_ud/
37
+ #
38
+ # Any records containing sequence information will be included in the
39
+ # assembly, but only the assembled contig sequences will be output to the
40
+ # stream.
41
+ #
42
+ # The sequence records may contain quality scores, and if the sequence
43
+ # names indicate that the sequence order is interleaved, paired-end
44
+ # assembly will be performed.
45
+ #
46
+ # == Usage
47
+ #
48
+ # assemble_seq_idba([kmer_min: <uint>[, kmer_max: <uint>[, cpus: <uint>]]])
49
+ #
50
+ # === Options
51
+ #
52
+ # * kmer_min: <uint> - Minimum k-mer value (default: 24).
53
+ # * kmer_max: <uint> - Maximum k-mer value (default: 48).
54
+ # * cpus: <uint> - Number of CPUs to use (default: 1).
55
+ #
56
+ # == Examples
57
+ #
58
+ # If you have two pair-end sequence files with the Illumina data then you
59
+ # can assemble these using +assemble_seq_idba+ like this:
60
+ #
61
+ # BP.new.
62
+ # read_fastq(input: "file1.fq", input2: "file2.fq").
63
+ # assemble_seq_idba.
64
+ # write_fasta(output: "contigs.fna").
65
+ # run
66
+ class AssembleSeqIdba
67
+ require 'English'
68
+ require 'BioDSL/helpers/aux_helper'
69
+
70
+ include AuxHelper
71
+
72
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
73
+ residues_out)
74
+
75
+ # Constructor for the AssembleSeqIdba class.
76
+ #
77
+ # @param [Hash] options Options hash.
78
+ # @option options [Integer] :kmer_min Minimum kmer value.
79
+ # @option options [Integer] :kmer_max Maximum kmer value.
80
+ # @option options [Integer] :cpus CPUs to use.
81
+ #
82
+ # @return [AssembleSeqIdba] Returns an instance of the class.
83
+ def initialize(options)
84
+ @options = options
85
+ @lengths = []
86
+
87
+ aux_exist('idba_ud')
88
+ check_options
89
+ defaults
90
+ end
91
+
92
# Return a lambda for the AssembleSeqIdba command.
#
# The lambda writes the sequence records to a temporary FASTA file, runs
# IDBA on it, emits the resulting contigs and finally adds the contig
# statistics to the status hash.
#
# @return [Proc] Returns the command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    TmpDir.create('reads.fna', 'contig.fa') do |fa_in, fa_out, tmp_dir|
      process_input(input, output, fa_in)
      execute_idba(fa_in, tmp_dir)
      # Contig lengths are collected in @lengths by process_output, so its
      # return value is not needed here.
      process_output(output, fa_out)
    end

    calc_n50(status)
  end
end
108
+
109
+ private
110
+
111
+ # Check the options.
112
+ def check_options
113
+ options_allowed(@options, :kmer_min, :kmer_max, :cpus)
114
+ options_assert(@options, ':kmer_min >= 16')
115
+ options_assert(@options, ':kmer_min <= 256')
116
+ options_assert(@options, ':kmer_max >= 16')
117
+ options_assert(@options, ':kmer_max <= 512')
118
+ options_assert(@options, ':cpus >= 1')
119
+ options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
120
+ end
121
+
122
+ # Set the default option values.
123
+ def defaults
124
+ @options[:kmer_min] ||= 24
125
+ @options[:kmer_max] ||= 48
126
+ @options[:cpus] ||= 1
127
+ end
128
+
129
# Read all records from input and emit non-sequence records to the output
# stream. Sequence records are saved to a temporary file.
#
# @param input [Enumerator] input stream.
# @param output [Enumerator::Yielder] Output stream.
# @param fa_in [String] Path to temporary FASTA file.
def process_input(input, output, fa_in)
  BioDSL::Fasta.open(fa_in, 'w') do |fasta_io|
    input.each do |record|
      @status[:records_in] += 1

      if record.key? :SEQ
        entry = BioDSL::Seq.new_bp(record)

        @status[:sequences_in] += 1
        @status[:residues_in] += entry.length

        fasta_io.puts entry.to_fasta
      else
        # Emit with << like the other writers to the stream do; an
        # Enumerator::Yielder has no public puts method.
        output << record
        @status[:records_out] += 1
      end
    end
  end
end
154
+
155
+ # Execute IDBA.
156
+ #
157
+ # @param fa_in [String] Path to input FASTA file.
158
+ # @param tmp_dir [String] Temporary directory path.
159
+ #
160
+ # @raise If execution fails.
161
+ def execute_idba(fa_in, tmp_dir)
162
+ cmd_line = compile_cmd_line(fa_in, tmp_dir)
163
+ $stderr.puts "Running: #{cmd_line}" if BioDSL.verbose
164
+ system(cmd_line)
165
+
166
+ fail cmd_line unless $CHILD_STATUS.success?
167
+ end
168
+
169
+ # Compile the command and options for executing IDBA.
170
+ #
171
+ # @param fa_in [String] Path to input FASTA file.
172
+ # @param tmp_dir [String] Temporary directory path.
173
+ #
174
+ # @return [String] The command line for the IDBA system call.
175
+ def compile_cmd_line(fa_in, tmp_dir)
176
+ cmd = []
177
+ cmd << 'idba_ud'
178
+ cmd << "--read #{fa_in}"
179
+ cmd << "--out #{tmp_dir}"
180
+ cmd << "--mink #{@options[:kmer_min]}"
181
+ cmd << "--maxk #{@options[:kmer_max]}"
182
+ cmd << "--num_threads #{@options[:cpus]}"
183
+ cmd << '> /dev/null 2>&1' unless BioDSL.verbose
184
+
185
+ cmd.join(' ')
186
+ end
187
+
188
+ # Read the IDBA assembled contigs and output to the stream.
189
+ #
190
+ # @param output [Enumerator::Yielder] Output stream.
191
+ # @param fa_out [String] Path to contig FASTA file.
192
+ def process_output(output, fa_out)
193
+ BioDSL::Fasta.open(fa_out, 'r') do |ios|
194
+ ios.each do |entry|
195
+ output << entry.to_bp
196
+ @status[:records_out] += 1
197
+ @status[:sequences_out] += 1
198
+ @status[:residues_out] += entry.length
199
+
200
+ @lengths << entry.length
201
+ end
202
+ end
203
+ end
204
+
205
+ # Calculate the n50 and add to the status.
206
+ #
207
+ # {http://en.wikipedia.org/wiki/N50_statistic}
208
+ #
209
+ # @param status [Hash] Status hash.
210
+ def calc_n50(status)
211
+ @lengths.sort!
212
+ @lengths.reverse!
213
+
214
+ status[:contig_max] = @lengths.first || 0
215
+ status[:contig_min] = @lengths.last || 0
216
+ status[:contig_n50] = 0
217
+
218
+ count = 0
219
+
220
+ @lengths.each do |length|
221
+ count += length
222
+
223
+ if count >= status[:residues_out] * 0.50
224
+ status[:contig_n50] = length
225
+ break
226
+ end
227
+ end
228
+ end
229
+ end
230
+ end