BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,414 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Read FASTQ entries from one or more files.
30
+ #
31
+ # +read_fastq+ reads in sequence entries from FASTQ files. Each sequence entry
32
+ # consists of a sequence name prefixed by a '@' on a line of its own,
33
+ # followed by the sequence, a separator line starting with '+' and the
34
+ # quality scores. The resulting Biopiece record consists of the
35
+ # following record type:
36
+ #
37
+ # {:SEQ_NAME=>"test",
38
+ # :SEQ=>"AGCATCGACTAGCAGCATTT",
39
+ # :SEQ_LEN=>20}
40
+ #
41
+ # It is possible to read in pair-end data interleaved by using the +input2+
42
+ # option. Reads are then emitted in turn from input and input2. If the
43
+ # +reverse_complement+ option is used, then the input2 reads will be
44
+ # reverse-complemented.
45
+ #
46
+ # Input files may be compressed with gzip or bzip2.
47
+ #
48
+ # For more about the FASTQ format:
49
+ #
50
+ # http://en.wikipedia.org/wiki/FASTQ_format
51
+ #
52
+ # == Usage
53
+ # read_fastq(input: <glob>[, input2: <glob>[, first: <uint>|last: <uint>
54
+ # [, reverse_complement: <bool>]]])
55
+ #
56
+ # === Options
57
+ # * input <glob> - Input file or file glob expression.
58
+ # * input2 <glob> - Input file or file glob expression.
59
+ # * first <uint> - Only read in the _first_ number of entries.
60
+ # * last <uint> - Only read in the _last_ number of entries.
61
+ # * reverse_complement <bool> - Reverse-complements input2 reads.
62
+ #
63
+ # == Examples
64
+ #
65
+ # To read all FASTQ entries from a file:
66
+ #
67
+ # BP.new.read_fastq(input: "test.fq").dump.run
68
+ #
69
+ # To read all FASTQ entries from a gzipped file:
70
+ #
71
+ # BP.new.read_fastq(input: "test.fq.gz").dump.run
72
+ #
73
+ # To read in only 10 records from a FASTQ file:
74
+ #
75
+ # BP.new.read_fastq(input: "test.fq", first: 10).dump.run
76
+ #
77
+ # To read in the last 10 records from a FASTQ file:
78
+ #
79
+ # BP.new.read_fastq(input: "test.fq", last: 10).dump.run
80
+ #
81
+ # To read all FASTQ entries from multiple files:
82
+ #
83
+ # BP.new.read_fastq(input: "test1.fq,test2.fq").dump.run
84
+ #
85
+ # To read FASTQ entries from multiple files using a glob expression:
86
+ #
87
+ # BP.new.read_fastq(input: "*.fq").dump.run
88
+ #
89
+ # To read FASTQ entries from pair-end data:
90
+ #
91
+ # BP.new.read_fastq(input: "file1.fq", input2: "file2.fq").dump.run
92
+ #
93
+ # To read FASTQ entries from pair-end data:
94
+ #
95
+ # BP.new.read_fastq(input: "file1.fq", input2: "file2.fq").dump.run
96
+ #
97
+ # To read FASTQ entries from pair-end data and reverse-complement read2:
98
+ #
99
+ # BP.new.
100
+ # read_fastq(input: "file1.fq", input2: "file2.fq",
101
+ # reverse_complement: true)
102
+ # .dump.run
103
+ #
104
+ # rubocop: disable ClassLength
105
+ # rubocop: disable Metrics/AbcSize
106
+ # rubocop: disable Metrics/CyclomaticComplexity
107
+ # rubocop: disable Metrics/PerceivedComplexity
108
+ class ReadFastq
# Once this many sequences have been emitted, check_score_range stops
# validating quality scores.
MAX_TEST = 1_000

# Names of the status counters handed to status_init by the command lambda.
# Frozen so the shared list cannot be modified by accident.
STATS = %i(records_in records_out sequences_in sequences_out residues_in
           residues_out).freeze
112
+
# Set up a ReadFastq command from the given options.
#
# The quality score encoding falls back to :auto when no :encoding option is
# given. The :input2 option is kept in @pair and later selects the paired
# read methods. Options are validated before the constructor returns.
#
# @param options [Hash] Options hash.
# @option options [Symbol,String] :encoding
# @option options [String] :input
# @option options [String] :input2
# @option options [Integer] :first
# @option options [Integer] :last
# @option options [Boolean] :reverse_complement
#
# @return [ReadFastq] Class instance.
def initialize(options)
  @options  = options
  @pair     = options[:input2]
  @buffer   = []
  @type     = nil
  @encoding = (options[:encoding] || :auto).to_sym

  check_options
end
133
+
# Build the lambda that runs this command.
#
# When called, the lambda initializes the status counters, forwards the
# records of the input stream and then reads the FASTQ files. The read method
# is chosen from the :first and :last options (:first is tested before :last)
# and from whether :input2 was given.
#
# @return [Proc] Command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    process_input(input, output)

    if @options[:first]
      @pair ? read_first_pair(output) : read_first_single(output)
    elsif @options[:last]
      @pair ? read_last_pair(output) : read_last_single(output)
    elsif @pair
      read_all_pair(output)
    else
      read_all_single(output)
    end
  end
end
154
+
155
+ private
156
+
# Validate the options hash with the option helper methods.
#
# The helpers are defined outside this class; the checks run in the order
# listed here.
def check_options
  opts    = @options
  allowed = %i(encoding input input2 first last reverse_complement)

  options_allowed(opts, *allowed)
  options_allowed_values(opts, encoding: %i(auto base_33 base_64))
  options_allowed_values(opts, reverse_complement: [nil, true, false])
  options_tie(opts, reverse_complement: :input2)
  options_required(opts, :input)
  options_files_exist(opts, :input, :input2)
  options_unique(opts, :first, :last)
  options_assert(opts, ':first >= 0')
  options_assert(opts, ':last >= 0')
end
170
+
# Forward every record of the input stream to the output stream.
#
# Each record counts towards :records_in and :records_out. Records carrying a
# :SEQ value also count towards :sequences_in and :residues_in. Nothing
# happens when there is no input stream.
#
# @param input [Enumerator] Input stream.
# @param output [Enumerator::Yielder] Output stream.
def process_input(input, output)
  return unless input

  input.each do |record|
    @status[:records_in]  += 1
    @status[:records_out] += 1

    sequence = record[:SEQ]

    if sequence
      @status[:sequences_in] += 1
      @status[:residues_in]  += sequence.length
    end

    output << record
  end
end
190
+
# Read :first FASTQ entries from single files.
#
# Files are read in the order returned by fastq_files, and reading stops as
# soon as :sequences_out reaches the limit. A limit of zero emits nothing.
#
# @param output [Enumerator::Yielder] Output stream.
def read_first_single(output)
  limit = @options[:first]

  # The limit is only tested after an entry has been emitted, so a zero
  # limit has to be rejected up front or one entry would slip through.
  return unless limit > 0

  fastq_files.each do |file|
    BioDSL::Fastq.open(file) do |ios|
      ios.each do |entry|
        check_entry(entry)
        output << entry.to_bp
        @status[:records_out] += 1
        @status[:sequences_out] += 1
        @status[:residues_out] += entry.length
        return if @status[:sequences_out] >= limit
      end
    end
  end
end
208
+
# Read :first FASTQ entries from paired files interleaved.
#
# Entries are emitted as whole pairs, so an odd limit yields one entry more
# than requested. A limit of zero emits nothing. Reading of a file pair stops
# when either file runs out of entries.
#
# @param output [Enumerator::Yielder] Output stream.
#
# rubocop: disable MethodLength
def read_first_pair(output)
  limit = @options[:first]

  # The limit is only tested after a pair has been emitted, so a zero limit
  # has to be rejected up front or one pair would slip through.
  return unless limit > 0

  fastq_files.each_slice(2) do |file1, file2|
    BioDSL::Fastq.open(file1) do |ios1|
      BioDSL::Fastq.open(file2) do |ios2|
        while (entry1 = ios1.next_entry) && (entry2 = ios2.next_entry)
          check_entry(entry1, entry2)
          reverse_complement(entry2) if @options[:reverse_complement]
          output << entry1.to_bp
          output << entry2.to_bp
          @status[:records_out] += 2
          @status[:sequences_out] += 2
          @status[:residues_out] += entry1.length + entry2.length
          return if @status[:sequences_out] >= limit
        end
      end
    end
  end
end
232
+
# Read :last FASTQ entries from single files.
#
# Every entry of every file is read; the buffer is kept at no more than :last
# entries by dropping the oldest one. The buffer is emitted at the end.
#
# @param output [Enumerator::Yielder] Output stream.
#
# rubocop: enable MethodLength
def read_last_single(output)
  keep = @options[:last]

  fastq_files.each do |path|
    BioDSL::Fastq.open(path) do |ios|
      ios.each do |entry|
        check_entry(entry)
        @buffer.push(entry)
        @buffer.shift if @buffer.size > keep
      end
    end
  end

  output_buffer(output)
end
251
+
# Read :last FASTQ entries from paired files interleaved.
#
# Every pair of every file pair is read; after each pair the buffer is
# trimmed from the front so it holds no more than :last entries. The buffer
# is emitted at the end. Reading of a file pair stops when either file runs
# out of entries.
#
# @param output [Enumerator::Yielder] Output stream.
def read_last_pair(output)
  fastq_files.each_slice(2) do |file1, file2|
    BioDSL::Fastq.open(file1) do |ios1|
      BioDSL::Fastq.open(file2) do |ios2|
        while (entry1 = ios1.next_entry) && (entry2 = ios2.next_entry)
          check_entry(entry1, entry2)
          reverse_complement(entry2) if @options[:reverse_complement]
          @buffer << entry1
          @buffer << entry2

          # Array#shift raises ArgumentError for a negative count, so only
          # trim once the buffer has actually grown beyond the limit.
          excess = @buffer.size - @options[:last]
          @buffer.shift(excess) if excess > 0
        end
      end
    end
  end

  output_buffer(output)
end
272
+
# Read all FASTQ entries from single files.
#
# Each entry is checked, emitted as a record and counted in :records_out,
# :sequences_out and :residues_out.
#
# @param output [Enumerator::Yielder] Output stream.
def read_all_single(output)
  fastq_files.each do |path|
    BioDSL::Fastq.open(path) do |ios|
      ios.each do |entry|
        check_entry(entry)

        output << entry.to_bp

        @status[:records_out]   += 1
        @status[:sequences_out] += 1
        @status[:residues_out]  += entry.length
      end
    end
  end
end
289
+
# Read all FASTQ entries from paired files interleaved.
#
# The file list is consumed two files at a time. For each file pair one entry
# is taken from each file in turn until either file runs out. The second
# entry is reverse-complemented when the :reverse_complement option is set.
#
# @param output [Enumerator::Yielder] Output stream.
def read_all_pair(output)
  fastq_files.each_slice(2) do |path1, path2|
    BioDSL::Fastq.open(path1) do |reader1|
      BioDSL::Fastq.open(path2) do |reader2|
        while (mate1 = reader1.next_entry) && (mate2 = reader2.next_entry)
          check_entry(mate1, mate2)
          reverse_complement(mate2) if @options[:reverse_complement]

          mates = [mate1, mate2]
          mates.each { |mate| output << mate.to_bp }

          @status[:records_out]   += mates.size
          @status[:sequences_out] += mates.size
          @status[:residues_out]  += mate1.length + mate2.length
        end
      end
    end
  end
end
310
+
# List the input files to read.
#
# Without :input2 this is the expanded :input list. With :input2 the two
# expanded lists are checked to be of equal size and then interleaved, so
# that consecutive elements form a file pair.
#
# @return [Array] List of FASTQ files.
def fastq_files
  return options_glob(@options[:input]) unless @options[:input2]

  forward = options_glob(@options[:input])
  reverse = options_glob(@options[:input2])

  check_input_files(forward, reverse)

  forward.zip(reverse).flatten
end
327
+
# Do the following for each of the given entries:
#
#   * determine the quality score encoding if it is still :auto.
#   * convert the quality scores from that encoding to base 33.
#   * coerce the quality scores to base 33.
#   * check the score range.
#
# @param entries [Array] Sequence entries.
def check_entry(*entries)
  target = :base_33

  entries.each do |seq_entry|
    determine_encoding(seq_entry)

    seq_entry.qual_convert!(@encoding, target)
    seq_entry.qual_coerce!(target)

    check_score_range(seq_entry)
  end
end
347
+
# Reverse-complement a sequence entry in place.
#
# The sequence type is guessed from the first entry seen and reused for all
# later entries.
#
# @param entry [BioDSL::Seq] Sequence entry.
def reverse_complement(entry)
  @type ||= entry.type_guess
  entry.type = @type
  entry.reverse!.complement!
end
356
+
# Check that files1 and files2 hold the same number of files.
#
# @param files1 [Array] List of files.
# @param files2 [Array] List of files.
#
# @raise [BioDSL::OptionError] If the counts differ.
def check_input_files(files1, files2)
  count1 = files1.size
  count2 = files2.size

  unless count1 == count2
    msg = "input and input2 file count don't match: #{count1} != #{count2}"
    fail BioDSL::OptionError, msg
  end
end
371
+
# Check the score range for a given entry.
#
# The check is skipped once :sequences_out has reached MAX_TEST.
#
# @param entry [BioDSL::Seq] Sequence entry.
#
# @raise [BioDSL::SeqError] If quality score is outside range.
def check_score_range(entry)
  return if @status[:sequences_out] >= MAX_TEST || entry.qual_valid?(:base_33)

  fail BioDSL::SeqError, 'Quality score outside valid range'
end
382
+
# Determine the quality score encoding from the given entry.
#
# Does nothing unless the encoding is still :auto. Base 33 is tested before
# base 64.
#
# @param entry [BioDSL::Seq] Sequence entry.
#
# @raise [BioDSL::SeqError] If encoding wasn't determined.
def determine_encoding(entry)
  return unless @encoding == :auto
  return @encoding = :base_33 if entry.qual_base33?
  return @encoding = :base_64 if entry.qual_base64?

  fail BioDSL::SeqError, 'Could not auto-detect quality score encoding'
end
398
+
# Emit every buffered entry to the output stream and count it in
# :records_out, :sequences_out and :residues_out.
#
# Nothing is emitted unless the :last option is set.
#
# @param output [Enumerator::Yielder] Output stream.
def output_buffer(output)
  return unless @options[:last]

  @buffer.each do |buffered|
    output << buffered.to_bp

    @status[:records_out]   += 1
    @status[:sequences_out] += 1
    @status[:residues_out]  += buffered.length
  end
end
413
+ end
414
+ end