BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
@@ -0,0 +1,414 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (www.github.com/maasha/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
module BioDSL
|
29
|
+
# == Read FASTQ entries from one or more files.
|
30
|
+
#
|
31
|
+
# +read_fastq+ read in sequence entries from FASTQ files. Each sequence entry
|
32
|
+
# consists of a sequence name prefixed by a '@' followed by the sequence name
|
33
|
+
# on a line of its own, followed by one or more lines of sequence until the next
|
34
|
+
# entry or the end of the file. The resulting Biopiece record consists of the
|
35
|
+
# following record type:
|
36
|
+
#
|
37
|
+
# {:SEQ_NAME=>"test",
|
38
|
+
# :SEQ=>"AGCATCGACTAGCAGCATTT",
|
39
|
+
# :SEQ_LEN=>20}
|
40
|
+
#
|
41
|
+
# It is possible to read in pair-end data interleaved by using the +input2+
|
42
|
+
# option. Thus a read is in turn from input and input2. If the
|
43
|
+
# +reverse_complement+ option is used, then the input2 reads will be
|
44
|
+
# reverse-complemented.
|
45
|
+
#
|
46
|
+
# Input files may be compressed with gzip or bzip2.
|
47
|
+
#
|
48
|
+
# For more about the FASTQ format:
|
49
|
+
#
|
50
|
+
# http://en.wikipedia.org/wiki/FASTQ_format
|
51
|
+
#
|
52
|
+
# == Usage
|
53
|
+
# read_fastq(input: <glob>[, input2: <glob>[, first: <uint>|last: <uint>
|
54
|
+
# [, reverse_complement: <bool>]]])
|
55
|
+
#
|
56
|
+
# === Options
|
57
|
+
# * input <glob> - Input file or file glob expression.
|
58
|
+
# * input2 <glob> - Input file or file glob expression.
|
59
|
+
# * first <uint> - Only read in the _first_ number of entries.
|
60
|
+
# * last <uint> - Only read in the _last_ number of entries.
|
61
|
+
# * reverse_complement: <bool> - Reverse-complements input2 reads.
|
62
|
+
#
|
63
|
+
# == Examples
|
64
|
+
#
|
65
|
+
# To read all FASTQ entries from a file:
|
66
|
+
#
|
67
|
+
# BP.new.read_fastq(input: "test.fq").dump.run
|
68
|
+
#
|
69
|
+
# To read all FASTQ entries from a gzipped file:
|
70
|
+
#
|
71
|
+
# BP.new.read_fastq(input: "test.fq.gz").dump.run
|
72
|
+
#
|
73
|
+
# To read in only 10 records from a FASTQ file:
|
74
|
+
#
|
75
|
+
# BP.new.read_fastq(input: "test.fq", first: 10).dump.run
|
76
|
+
#
|
77
|
+
# To read in the last 10 records from a FASTQ file:
|
78
|
+
#
|
79
|
+
# BP.new.read_fastq(input: "test.fq", last: 10).dump.run
|
80
|
+
#
|
81
|
+
# To read all FASTQ entries from multiple files:
|
82
|
+
#
|
83
|
+
# BP.new.read_fastq(input: "test1.fq,test2.fq").dump.run
|
84
|
+
#
|
85
|
+
# To read FASTQ entries from multiple files using a glob expression:
|
86
|
+
#
|
87
|
+
# BP.new.read_fastq(input: "*.fq").dump.run
|
88
|
+
#
|
89
|
+
# To read FASTQ entries from pair-end data:
|
90
|
+
#
|
91
|
+
# BP.new.read_fastq(input: "file1.fq", input2: "file2.fq").dump.run
|
92
|
+
#
|
93
|
+
# To read FASTQ entries from pair-end data:
|
94
|
+
#
|
95
|
+
# BP.new.read_fastq(input: "file1.fq", input2: "file2.fq").dump.run
|
96
|
+
#
|
97
|
+
# To read FASTQ entries from pair-end data and reverse-complement read2:
|
98
|
+
#
|
99
|
+
# BP.new.
|
100
|
+
# read_fastq(input: "file1.fq", input2: "file2.fq",
|
101
|
+
# reverse_complement: true)
|
102
|
+
# .dump.run
|
103
|
+
#
|
104
|
+
# rubocop: disable ClassLength
|
105
|
+
# rubocop: disable Metrics/AbcSize
|
106
|
+
# rubocop: disable Metrics/CyclomaticComplexity
|
107
|
+
# rubocop: disable Metrics/PerceivedComplexity
|
108
|
+
class ReadFastq
  # Once this many sequences have been emitted, check_score_range stops
  # validating quality scores.
  MAX_TEST = 1_000

  # Names of the status counters handed to status_init by the command lambda.
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for ReadFastq.
  #
  # @param options [Hash] Options hash.
  # @option options [Symbol,String] :encoding
  # @option options [String] :input
  # @option options [String] :input2
  # @option options [Integer] :first
  # @option options [Integer] :last
  # @option options [Boolean] :reverse_complement
  #
  # @return [ReadFastq] Class instance.
  def initialize(options)
    @options  = options
    @encoding = options[:encoding] ? options[:encoding].to_sym : :auto
    @pair     = options[:input2]
    @buffer   = [] # Holds the trailing entries when :last is given.
    @type     = nil # Sequence type, guessed lazily by reverse_complement.

    check_options
  end

  # Return command lambda for ReadFastq.
  #
  # The lambda first passes any records from the input stream through to the
  # output stream and then emits the FASTQ entries, picking the reader that
  # matches the combination of :first, :last and :input2.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      # NOTE(review): status_init is defined outside this file; the readers
      # below rely on it having made the counters available as @status.
      status_init(status, STATS)

      process_input(input, output)

      if @options[:first]
        @pair ? read_first_pair(output) : read_first_single(output)
      elsif @options[:last]
        @pair ? read_last_pair(output) : read_last_single(output)
      elsif @pair
        read_all_pair(output)
      else
        read_all_single(output)
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :encoding, :input, :input2, :first, :last,
                    :reverse_complement)
    options_allowed_values(@options, encoding: [:auto, :base_33, :base_64])
    options_allowed_values(@options, reverse_complement: [nil, true, false])
    options_tie(@options, reverse_complement: :input2)
    options_required(@options, :input)
    options_files_exist(@options, :input, :input2)
    options_unique(@options, :first, :last)
    options_assert(@options, ':first >= 0')
    options_assert(@options, ':last >= 0')
  end

  # Emit all records from the input stream to the output stream.
  #
  # Only the *_in sequence/residue counters are updated here;
  # :sequences_out is left to the FASTQ readers, which use it to count the
  # entries they have emitted (e.g. for the :first limit).
  #
  # @param input  [Enumerator]          Input stream (may be nil).
  # @param output [Enumerator::Yielder] Output stream.
  def process_input(input, output)
    return unless input

    input.each do |record|
      @status[:records_in]  += 1
      @status[:records_out] += 1

      if (seq = record[:SEQ])
        @status[:sequences_in] += 1
        @status[:residues_in]  += seq.length
      end

      output << record
    end
  end

  # Read :first FASTQ entries from single files.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_first_single(output)
    # A limit of 0 must emit nothing; without this guard one entry would be
    # emitted before the limit is tested.
    return unless @options[:first] > 0

    each_entry do |entry|
      emit(output, entry)
      break if @status[:sequences_out] >= @options[:first]
    end
  end

  # Read :first FASTQ entries from paired files interleaved.
  #
  # Entries are always emitted two at a time, so an odd :first yields one
  # entry more than requested.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_first_pair(output)
    return unless @options[:first] > 0

    each_entry_pair do |entry1, entry2|
      emit(output, entry1)
      emit(output, entry2)
      break if @status[:sequences_out] >= @options[:first]
    end
  end

  # Read :last FASTQ entries from single files.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_last_single(output)
    each_entry do |entry|
      @buffer << entry
      @buffer.shift if @buffer.size > @options[:last]
    end

    output_buffer(output)
  end

  # Read :last FASTQ entries from paired files interleaved.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_last_pair(output)
    each_entry_pair do |entry1, entry2|
      @buffer << entry1 << entry2

      # Only trim once the buffer has outgrown :last. Array#shift raises
      # ArgumentError when given a negative count.
      excess = @buffer.size - @options[:last]
      @buffer.shift(excess) if excess > 0
    end

    output_buffer(output)
  end

  # Read all FASTQ entries from single files.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_all_single(output)
    each_entry { |entry| emit(output, entry) }
  end

  # Read all FASTQ entries from paired files interleaved.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_all_pair(output)
    each_entry_pair do |entry1, entry2|
      emit(output, entry1)
      emit(output, entry2)
    end
  end

  # Iterate over every entry of every input file, in file order. Each entry
  # has been run through check_entry before it is yielded.
  #
  # @yieldparam entry [BioDSL::Seq] Checked sequence entry.
  def each_entry
    fastq_files.each do |file|
      BioDSL::Fastq.open(file) do |ios|
        ios.each do |entry|
          check_entry(entry)
          yield entry
        end
      end
    end
  end

  # Iterate over the paired input files in lockstep, yielding one entry from
  # each file per step. Both entries have been run through check_entry and
  # the second one is reverse-complemented if :reverse_complement is set.
  #
  # Reading of a file pair stops as soon as either file runs out of entries.
  # If the first file is the longer one, the surplus entry already read from
  # it is dropped silently.
  #
  # @yieldparam entry1 [BioDSL::Seq] Entry from the :input file.
  # @yieldparam entry2 [BioDSL::Seq] Entry from the :input2 file.
  def each_entry_pair
    fastq_files.each_slice(2) do |file1, file2|
      BioDSL::Fastq.open(file1) do |ios1|
        BioDSL::Fastq.open(file2) do |ios2|
          while (entry1 = ios1.next_entry) && (entry2 = ios2.next_entry)
            check_entry(entry1, entry2)
            reverse_complement(entry2) if @options[:reverse_complement]
            yield entry1, entry2
          end
        end
      end
    end
  end

  # Write a single entry to the output stream and update the *_out counters.
  #
  # @param output [Enumerator::Yielder] Output stream.
  # @param entry  [BioDSL::Seq]         Sequence entry.
  def emit(output, entry)
    output << entry.to_bp

    @status[:records_out]   += 1
    @status[:sequences_out] += 1
    @status[:residues_out]  += entry.length
  end

  # Return a list of input files or an interleaved list of input files if
  # :input2 is specified.
  #
  # @raise [BioDSL::OptionError] If :input and :input2 expand to a different
  #   number of files.
  #
  # @return [Array] List of FASTQ files.
  def fastq_files
    return options_glob(@options[:input]) unless @options[:input2]

    files1 = options_glob(@options[:input])
    files2 = options_glob(@options[:input2])

    check_input_files(files1, files2)

    # Interleave: [a1, b1, a2, b2, ...] so each_slice(2) yields file pairs.
    files1.zip(files2).flatten
  end

  # Do the following for each of the given entries:
  #
  #   * determine encoding (only while it is still :auto).
  #   * convert encoding to base 33.
  #   * coerce encoding.
  #   * check score range.
  #
  # @param entries [Array] Sequence entries.
  def check_entry(*entries)
    entries.each do |entry|
      determine_encoding(entry)

      entry.qual_convert!(@encoding, :base_33)
      entry.qual_coerce!(:base_33)

      check_score_range(entry)
    end
  end

  # Reverse complement sequence in place. The sequence type is guessed from
  # the first entry seen and reused for all following entries.
  #
  # @param entry [BioDSL::Seq] Sequence entry.
  def reverse_complement(entry)
    @type ||= entry.type_guess
    entry.type = @type
    entry.reverse!.complement!
  end

  # Check that files1 and files2 hold the same number of files.
  #
  # @param files1 [Array] List of files.
  # @param files2 [Array] List of files.
  #
  # @raise [BioDSL::OptionError] If the counts differ.
  def check_input_files(files1, files2)
    size1 = files1.size
    size2 = files2.size
    return if size1 == size2

    msg = "input and input2 file count don't match: #{size1} != #{size2}"
    raise BioDSL::OptionError, msg
  end

  # Check the score range for a given entry. The check is skipped once
  # MAX_TEST sequences have been emitted.
  #
  # @param entry [BioDSL::Seq] Sequence entry.
  #
  # @raise [BioDSL::SeqError] If quality score is outside range.
  def check_score_range(entry)
    return if @status[:sequences_out] >= MAX_TEST
    return if entry.qual_valid?(:base_33)

    raise BioDSL::SeqError, 'Quality score outside valid range'
  end

  # Determine the quality score encoding from the given entry. Does nothing
  # once the encoding is known (given as an option or detected earlier).
  #
  # @param entry [BioDSL::Seq] Sequence entry.
  #
  # @raise [BioDSL::SeqError] If encoding wasn't determined.
  def determine_encoding(entry)
    return unless @encoding == :auto

    if entry.qual_base33?
      @encoding = :base_33
    elsif entry.qual_base64?
      @encoding = :base_64
    else
      raise BioDSL::SeqError, 'Could not auto-detect quality score encoding'
    end
  end

  # Emit all records in the buffer to the output stream.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def output_buffer(output)
    return unless @options[:last]

    @buffer.each { |entry| emit(output, entry) }
  end
end
|
414
|
+
end
|