BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Trim sequence ends in the stream matching a specified primer.
|
|
30
|
+
#
|
|
31
|
+
# +trim_primer+ can trim full or partial primer sequence from sequence ends.
|
|
32
|
+
# This is done by matching the primer at the end specified by the +direction+
|
|
33
|
+
# option:
|
|
34
|
+
#
|
|
35
|
+
# Forward clip:
|
|
36
|
+
# sequence ATCGACTGCATCACGACG
|
|
37
|
+
# primer CATGAATCGA
|
|
38
|
+
# result CTGCATCACGACG
|
|
39
|
+
#
|
|
40
|
+
# Reverse clip:
|
|
41
|
+
# sequence ATCGACTGCATCACGACG
|
|
42
|
+
# primer GACGATAGCA
|
|
43
|
+
# result ATCGACTGCATCAC
|
|
44
|
+
#
|
|
45
|
+
# The primer sequence can be reverse complemented using the
|
|
46
|
+
# +reverse_complement+ option. Also, a minimum overlap for trimming can be
|
|
47
|
+
# specified using the +overlap_min+ option (default=1).
|
|
48
|
+
#
|
|
49
|
+
# Non-perfect matching can be allowed by setting the allowed
|
|
50
|
+
# +mismatch_percent+, +insertion_percent+ and +deletion_percent+.
|
|
51
|
+
#
|
|
52
|
+
# The following keys are added to clipped records:
|
|
53
|
+
#
|
|
54
|
+
# * TRIM_PRIMER_DIR - Direction of clip.
|
|
55
|
+
# * TRIM_PRIMER_POS - Sequence position of clip (0 based).
|
|
56
|
+
# * TRIM_PRIMER_LEN - Length of clip match.
|
|
57
|
+
# * TRIM_PRIMER_PAT - Clip match pattern.
|
|
58
|
+
# == Usage
|
|
59
|
+
#
|
|
60
|
+
# trim_primer(<primer: <string>>, <direction: <:forward|:reverse>>
|
|
61
|
+
# [, reverse_complement: <bool>[, overlap_min: <uint>
|
|
62
|
+
# [, mismatch_percent: <uint>
|
|
63
|
+
# [, insertion_percent: <uint>
|
|
64
|
+
# [, deletion_percent: <uint>]]]]])
|
|
65
|
+
#
|
|
66
|
+
# === Options
|
|
67
|
+
#
|
|
68
|
+
# * primer: <string> - Primer sequence to search for.
|
|
69
|
+
# * direction: <:forward|:reverse> - Clip direction.
|
|
70
|
+
# * reverse_complement: <bool> - Reverse complement primer (default=false)
|
|
71
|
+
# * overlap_min: <uint> - Minimum primer length used (default=1)
|
|
72
|
+
# * mismatch_percent: <uint> - Allowed percent mismatches (default=0)
|
|
73
|
+
# * insertion_percent: <uint> - Allowed percent insertions (default=0)
|
|
74
|
+
# * deletion_percent: <uint> - Allowed percent deletions (default=0)
|
|
75
|
+
#
|
|
76
|
+
# == Examples
|
|
77
|
+
#
|
|
78
|
+
# Consider the following FASTA entry in the file test.fna:
|
|
79
|
+
#
|
|
80
|
+
# >test
|
|
81
|
+
# ACTGACTGATGACTACGACTACGACTACTACTACGT
|
|
82
|
+
#
|
|
83
|
+
# The forward end can be trimmed like this:
|
|
84
|
+
#
|
|
85
|
+
# BP.new.
|
|
86
|
+
# read_fasta(input: "test.fna").
|
|
87
|
+
# trim_primer(primer: "ATAGAACTGAC", direction: :forward).
|
|
88
|
+
# dump.
|
|
89
|
+
# run
|
|
90
|
+
#
|
|
91
|
+
# {:SEQ_NAME=>"test",
|
|
92
|
+
# :SEQ=>"TGATGACTACGACTACGACTACTACTACGT",
|
|
93
|
+
# :SEQ_LEN=>30,
|
|
94
|
+
# :TRIM_PRIMER_DIR=>"FORWARD",
|
|
95
|
+
# :TRIM_PRIMER_POS=>0,
|
|
96
|
+
# :TRIM_PRIMER_LEN=>6,
|
|
97
|
+
# :TRIM_PRIMER_PAT=>"ACTGAC"}
|
|
98
|
+
#
|
|
99
|
+
# And trimming a reverse primer:
|
|
100
|
+
#
|
|
101
|
+
# BP.new.
|
|
102
|
+
# read_fasta(input: "test.fna").
|
|
103
|
+
# trim_primer(primer: "ACTACGTGCGGAT", direction: :reverse).
|
|
104
|
+
# dump.
|
|
105
|
+
# run
|
|
106
|
+
#
|
|
107
|
+
# {:SEQ_NAME=>"test",
|
|
108
|
+
# :SEQ=>"ACTGACTGATGACTACGACTACGACTACT",
|
|
109
|
+
# :SEQ_LEN=>29,
|
|
110
|
+
# :TRIM_PRIMER_DIR=>"REVERSE",
|
|
111
|
+
# :TRIM_PRIMER_POS=>29,
|
|
112
|
+
# :TRIM_PRIMER_LEN=>7,
|
|
113
|
+
# :TRIM_PRIMER_PAT=>"ACTACGT"}
|
|
114
|
+
#
|
|
115
|
+
# rubocop: disable ClassLength
|
|
116
|
+
class TrimPrimer
|
|
117
|
+
# Names of the status counters handed to status_init by the command lambda.
STATS = [
  :records_in,   :records_out,
  :sequences_in, :sequences_out,
  :pattern_hits, :pattern_misses,
  :residues_in,  :residues_out
]
|
|
119
|
+
|
|
120
|
+
# Constructor for TrimPrimer.
#
# Missing numeric options are filled in with their defaults, the options are
# validated with check_options, and only then is the search pattern derived
# from the primer. Deriving the pattern last means a missing or invalid
# option is reported by check_options instead of first being fed into the
# pattern construction.
#
# @param options [Hash] Options hash.
# @option options [String]  :primer
# @option options [Symbol]  :direction
# @option options [Integer] :overlap_min        (default=1)
# @option options [Boolean] :reverse_complement
# @option options [Integer] :mismatch_percent   (default=0)
# @option options [Integer] :insertion_percent  (default=0)
# @option options [Integer] :deletion_percent   (default=0)
#
# @return [TrimPrimer] Class instance.
def initialize(options)
  @options = options
  @options[:overlap_min]       ||= 1
  @options[:mismatch_percent]  ||= 0
  @options[:insertion_percent] ||= 0
  @options[:deletion_percent]  ||= 0
  @hit = false

  check_options

  # Only build the pattern from options that passed validation.
  @pattern = pattern
end
|
|
143
|
+
|
|
144
|
+
# Build the command lambda for trim_primer.
#
# The lambda initialises the status counters, then passes every record from
# input to output. Records with a non-empty :SEQ value are first handed to
# trim_forward or trim_reverse according to the +direction+ option.
#
# @return [Proc] Command lambda taking input, output and status.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    input.each do |record|
      @status[:records_in] += 1

      seq = record[:SEQ]

      if seq && seq.length > 0
        @status[:sequences_in] += 1

        direction = @options[:direction]

        if direction == :forward
          trim_forward(record)
        elsif direction == :reverse
          trim_reverse(record)
        end
      end

      output << record
      @status[:records_out] += 1
    end
  end
end
|
|
169
|
+
|
|
170
|
+
private
|
|
171
|
+
|
|
172
|
+
# Check options.
#
# Runs the options_* helpers on @options: the allowed keys, the required
# keys (:primer and :direction), the allowed values for :direction and
# :reverse_complement, and the numeric assertions on :overlap_min and the
# three *_percent options.
def check_options
  known = %i(primer direction overlap_min reverse_complement
             mismatch_percent insertion_percent deletion_percent)

  options_allowed(@options, *known)
  options_required(@options, :primer, :direction)

  { direction: [:forward, :reverse],
    reverse_complement: [true, false] }.each do |key, values|
    options_allowed_values(@options, key => values)
  end

  options_assert(@options, ':overlap_min > 0')

  %w(mismatch insertion deletion).each do |kind|
    options_assert(@options, ":#{kind}_percent >= 0")
  end
end
|
|
185
|
+
|
|
186
|
+
# Determine the search pattern from the +primer+ option.
#
# @return The primer as given, or, when the +reverse_complement+ option is
#   set, the sequence of the reversed and complemented primer.
def pattern
  primer = @options[:primer]

  return primer unless @options[:reverse_complement]

  Seq.new(seq: primer, type: :dna).reverse.complement.seq
end
|
|
194
|
+
|
|
195
|
+
# Trim record with sequence in the forward direction.
#
# The pattern is tried with match_forward and, while no match is found,
# shortened by dropping its first residue until it is shorter than the
# +overlap_min+ option. On a match the record is updated via merge_forward.
#
# The shortening happens on @pattern because match_forward reads it. The
# full pattern is therefore restored before returning, and @hit is reset on
# entry. Otherwise a pattern shortened for one record would be used for all
# following records, and one hit would make every later record count as a
# hit in the status.
#
# @param record [Hash] BioDSL record
def trim_forward(record)
  full_pattern = @pattern
  @hit         = false

  entry = BioDSL::Seq.new_bp(record)

  @status[:residues_in] += entry.length

  while @pattern.length >= @options[:overlap_min]
    if (match = match_forward(entry))
      merge_forward(record, entry, match)
      @hit = true
      break
    end

    @pattern = @pattern[1...@pattern.length]
  end

  if @hit
    @status[:pattern_hits] += 1
  else
    @status[:pattern_misses] += 1
  end
ensure
  # Every record must start from the complete pattern.
  @pattern = full_pattern
end
|
|
215
|
+
|
|
216
|
+
# Search the given entry for the current pattern.
#
# The match options computed for the pattern length are extended with
# start and stop both set to 0 and passed to entry.patmatch.
#
# @param entry [BioDSL::Seq] Sequence entry.
#
# @return [BioDSL::Seq::Match,nil] Match result.
def match_forward(entry)
  opts = match_options(@pattern.length).merge(start: 0, stop: 0)

  entry.patmatch(@pattern, opts)
end
|
|
228
|
+
|
|
229
|
+
# Keep the part of the entry that follows the match and merge it, together
# with the TRIM_PRIMER_* keys describing the match, into the record.
#
# @param record [Hash] BioDSL record
# @param entry [BioDSL::Seq] Sequence entry.
# @param match [BioDSL::Seq::Match] Match data.
def merge_forward(record, entry, match)
  clip_end = match.pos + match.length

  record.merge!(entry[clip_end..-1].to_bp)
  record.merge!(TRIM_PRIMER_DIR: 'FORWARD',
                TRIM_PRIMER_POS: match.pos,
                TRIM_PRIMER_LEN: match.length)
  record[:TRIM_PRIMER_PAT] = match.match
end
|
|
244
|
+
|
|
245
|
+
# Trim record with sequence in the reverse direction.
#
# The pattern is tried with match_reverse and, while no match is found,
# shortened by dropping its last residue until it is shorter than the
# +overlap_min+ option. On a match the record is updated via merge_reverse.
#
# The shortening happens on @pattern because match_reverse reads it. The
# full pattern is therefore restored before returning, and @hit is reset on
# entry. Otherwise a pattern shortened for one record would be used for all
# following records, and one hit would make every later record count as a
# hit in the status.
#
# @param record [Hash] BioDSL record
def trim_reverse(record)
  full_pattern = @pattern
  @hit         = false

  entry = BioDSL::Seq.new_bp(record)

  @status[:residues_in] += entry.length

  while @pattern.length >= @options[:overlap_min]
    if (match = match_reverse(entry))
      merge_reverse(record, entry, match)
      @hit = true
      break
    end

    @pattern = @pattern[0...@pattern.length - 1]
  end

  if @hit
    @status[:pattern_hits] += 1
  else
    @status[:pattern_misses] += 1
  end
ensure
  # Every record must start from the complete pattern.
  @pattern = full_pattern
end
|
|
265
|
+
|
|
266
|
+
# Search the given entry for the current pattern.
#
# The match options computed for the pattern length are extended with a
# start position of entry length minus pattern length (never below 0) and
# passed to entry.patmatch.
#
# @param entry [BioDSL::Seq] Sequence entry.
#
# @return [BioDSL::Seq::Match,nil] Match result.
def match_reverse(entry)
  opts  = match_options(@pattern.length)
  first = [entry.length - @pattern.length, 0].max

  entry.patmatch(@pattern, opts.merge(start: first))
end
|
|
281
|
+
|
|
282
|
+
# Keep the part of the entry that precedes the match and merge it, together
# with the TRIM_PRIMER_* keys describing the match, into the record.
#
# @param record [Hash] BioDSL record
# @param entry [BioDSL::Seq] Sequence entry.
# @param match [BioDSL::Seq::Match] Match data.
def merge_reverse(record, entry, match)
  kept = entry[0...match.pos]

  record.merge!(kept.to_bp)
  record.merge!(TRIM_PRIMER_DIR: 'REVERSE',
                TRIM_PRIMER_POS: match.pos,
                TRIM_PRIMER_LEN: match.length)
  record[:TRIM_PRIMER_PAT] = match.match
end
|
|
297
|
+
|
|
298
|
+
# Convert the percent options into absolute numbers of mismatches,
# insertions and deletions allowed for a pattern of the given length.
# Each value is length * percent * 0.01, rounded.
#
# @param length [Integer] Pattern length.
#
# @return [Hash] Match options hash.
def match_options(length)
  allowed = lambda do |percent_key|
    (length * @options[percent_key] * 0.01).round
  end

  { max_mismatches: allowed.call(:mismatch_percent),
    max_insertions: allowed.call(:insertion_percent),
    max_deletions:  allowed.call(:deletion_percent) }
end
|
|
313
|
+
end
|
|
314
|
+
end
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Trim sequence ends removing residues with a low quality score.
|
|
30
|
+
#
|
|
31
|
+
# +trim_seq+ removes subquality residues from the ends of sequences in the
|
|
32
|
+
# stream based on quality SCORES in a FASTQ type quality score string.
|
|
33
|
+
# Trimming progresses until a stretch, specified with the +length_min+
|
|
34
|
+
# option, is found thus preventing premature termination of the trimming
|
|
35
|
+
# by e.g. a single good quality residue at the end. It is possible, using
|
|
36
|
+
# the +mode+ option to indicate if the sequence should be trimmed from the
|
|
37
|
+
# left or right end or both (default=:both).
|
|
38
|
+
#
|
|
39
|
+
# == Usage
|
|
40
|
+
#
|
|
41
|
+
# trim_seq([quality_min: <uint>[, length_min: <uint>
|
|
42
|
+
# [, mode: <:left|:right|:both>]]])
|
|
43
|
+
#
|
|
44
|
+
# === Options
|
|
45
|
+
#
|
|
46
|
+
# * quality_min: <uint> - Minimum quality (default=20).
|
|
47
|
+
# * length_min: <uint> - Minimum stretch length (default=3).
|
|
48
|
+
# * mode: <string> - Trim mode :left|:right|:both (default=:both).
|
|
49
|
+
#
|
|
50
|
+
# == Examples
|
|
51
|
+
#
|
|
52
|
+
# Consider the following FASTQ entry in the file test.fq:
|
|
53
|
+
#
|
|
54
|
+
# @test
|
|
55
|
+
# gatcgatcgtacgagcagcatctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcat
|
|
56
|
+
# +
|
|
57
|
+
# @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJI
|
|
58
|
+
#
|
|
59
|
+
# To trim both ends simply do:
|
|
60
|
+
#
|
|
61
|
+
# BP.new.read_fastq(input: "test.fq").trim_seq.trim_seq.run
|
|
62
|
+
#
|
|
63
|
+
# SEQ_NAME: test
|
|
64
|
+
# SEQ: tctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcat
|
|
65
|
+
# SEQ_LEN: 62
|
|
66
|
+
# SCORES: TUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJI
|
|
67
|
+
# ---
|
|
68
|
+
#
|
|
69
|
+
# Use the +quality_min+ option to change the minimum value to discard:
|
|
70
|
+
#
|
|
71
|
+
# BP.new.
|
|
72
|
+
# read_fastq(input: "test.fq").
|
|
73
|
+
# trim_seq(quality_min: 25).
|
|
74
|
+
# trim_seq.
|
|
75
|
+
# run
|
|
76
|
+
#
|
|
77
|
+
# SEQ_NAME: test
|
|
78
|
+
# SEQ: cgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
|
|
79
|
+
# SEQ_LEN: 57
|
|
80
|
+
# SCORES: YZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
|
|
81
|
+
# ---
|
|
82
|
+
#
|
|
83
|
+
# To trim the left end only (use :right for right end only), do:
|
|
84
|
+
#
|
|
85
|
+
# BP.new.read_fastq(input: "test.fq").trim_seq(mode: :left).trim_seq.run
|
|
86
|
+
#
|
|
87
|
+
# SEQ_NAME: test
|
|
88
|
+
# SEQ: tctgacgtatcgatcgttgattagttgctagctatgcagtctacgacgagcatgctagctag
|
|
89
|
+
# SEQ_LEN: 62
|
|
90
|
+
# SCORES: TUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDChhh
|
|
91
|
+
# ---
|
|
92
|
+
#
|
|
93
|
+
# To increase the length of stretch of good quality residues to match, use
|
|
94
|
+
# the +length_min+ option:
|
|
95
|
+
#
|
|
96
|
+
# BP.new.read_fastq(input: "test.fq").trim_seq(length_min: 4).trim_seq.run
|
|
97
|
+
#
|
|
98
|
+
# SEQ_NAME: test
|
|
99
|
+
# SEQ: tctgacgtatcgatcgttgattagttgctagctatgcagtct
|
|
100
|
+
# SEQ_LEN: 42
|
|
101
|
+
# SCORES: TUVWXYZ[\]^_`abcdefghhgfedcba`_^]\[ZYXWVUT
|
|
102
|
+
# ---
|
|
103
|
+
class TrimSeq
|
|
104
|
+
# Names of the status counters handed to status_init by the command lambda.
STATS = [
  :records_in,   :records_out,
  :sequences_in, :sequences_out,
  :residues_in,  :residues_out
]
|
|
106
|
+
|
|
107
|
+
# Constructor for the TrimSeq class.
#
# The options are checked, missing options are filled in with their
# defaults, and the resulting values are cached in instance variables.
#
# @param [Hash] options Options hash.
#
# @option options [Integer] :quality_min
#   TrimSeq minimum quality (default=20).
#
# @option options [Symbol] :mode
#   TrimSeq mode (default=:both).
#
# @option options [Integer] :length_min
#   TrimSeq stretch length triggering trim (default=3).
#
# @return [TrimSeq] Returns an instance of the TrimSeq class.
def initialize(options)
  @options = options

  check_options
  defaults

  @mode = @options[:mode].to_sym
  @min, @len = @options.values_at(:quality_min, :length_min)
end
|
|
133
|
+
|
|
134
|
+
# Build the command lambda for trim_seq.
#
# The lambda initialises the status counters, then passes every record from
# input to output. Records that have both :SEQ and :SCORES are first handed
# to trim_seq.
#
# @return [Proc] Returns the trim_seq command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    input.each do |record|
      @status[:records_in] += 1

      trimmable = record[:SEQ] && record[:SCORES]
      trim_seq(record) if trimmable

      output << record
      @status[:records_out] += 1
    end
  end
end
|
|
152
|
+
|
|
153
|
+
private
|
|
154
|
+
|
|
155
|
+
# Check the options.
#
# Runs the options_* helpers on @options: the allowed keys, the allowed
# values for :mode, and the numeric assertions on :quality_min (0 to 40)
# and :length_min (above 0).
def check_options
  options_allowed(@options, *%i(quality_min length_min mode))
  options_allowed_values(@options, mode: %i(left right both))

  ['>= 0', '<= 40'].each do |bound|
    options_assert(@options, ":quality_min #{bound}")
  end

  options_assert(@options, ':length_min > 0')
end
|
|
163
|
+
|
|
164
|
+
# Set default options: :quality_min 20, :mode :both and :length_min 3 are
# filled in for every option that is not already set.
def defaults
  fallback = { quality_min: 20, mode: :both, length_min: 3 }

  fallback.each { |key, value| @options[key] ||= value }
end
|
|
170
|
+
|
|
171
|
+
# Quality trim the sequence of the given record and merge the trimmed
# sequence back into the record, updating the sequence and residue
# counters before and after trimming.
#
# The trim method is picked from @mode; an unknown mode trims nothing.
#
# @param record [Hash] BioDSL record
def trim_seq(record)
  entry = BioDSL::Seq.new_bp(record)

  @status[:sequences_in] += 1
  @status[:residues_in]  += entry.length

  trimmer = { both:  :quality_trim!,
              left:  :quality_trim_left!,
              right: :quality_trim_right! }[@mode]

  entry.public_send(trimmer, @min, @len) if trimmer

  @status[:sequences_out] += 1
  @status[:residues_out]  += entry.length

  record.merge!(entry.to_bp)
end
|
|
191
|
+
end
|
|
192
|
+
end
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Run uchime_ref on sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# This is a wrapper for the +usearch+ tool to run the program uchime_ref.
|
|
32
|
+
# Basically sequence type records are searched against a reference database of
|
|
33
|
+
# non-chimeric sequences, and chimeric sequences are filtered out so only
|
|
34
|
+
# non-chimeric sequences are output.
|
|
35
|
+
#
|
|
36
|
+
# Please refer to the manual:
|
|
37
|
+
#
|
|
38
|
+
# http://drive5.com/usearch/manual/uchime_ref.html
|
|
39
|
+
#
|
|
40
|
+
# Usearch 7.0 must be installed for +usearch+ to work. Read more here:
|
|
41
|
+
#
|
|
42
|
+
# http://www.drive5.com/usearch/
|
|
43
|
+
#
|
|
44
|
+
# == Usage
|
|
45
|
+
#
|
|
46
|
+
# uchime_ref(<database: <file>>[, cpus: <uint>])
|
|
47
|
+
#
|
|
48
|
+
# === Options
|
|
49
|
+
#
|
|
50
|
+
# * database: <file> - Database to search (in FASTA format).
|
|
51
|
+
# * cpus: <uint> - Number of CPU cores to use (default=1).
|
|
52
|
+
#
|
|
53
|
+
# == Examples
|
|
54
|
+
#
|
|
55
|
+
class UchimeRef
  require 'BioDSL/helpers/aux_helper'

  include AuxHelper

  # Names of the status counters handed to status_init in #lmb and
  # incremented while records are processed.
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for UchimeRef.
  #
  # Calls aux_exist('usearch') (presumably a check that the +usearch+
  # executable is available - confirm in AuxHelper), validates the options and
  # then fills in defaults. Note that the defaults are written into the
  # options hash that was passed in.
  #
  # @param options [Hash] Options hash.
  # @option options [String]  :database Path to database file to search.
  # @option options [Integer] :cpus     Number of CPU cores to use (default=1).
  #
  # @return [UchimeRef] Class instance.
  def initialize(options)
    @options = options
    aux_exist('usearch')
    check_options
    @options[:cpus] ||= 1
    @options[:strand] ||= 'plus' # This option cant be changed in usearch7.0
  end

  # Return command lambda for uchime_ref.
  #
  # The lambda writes records with sequences to a temporary FASTA file, runs
  # uchime_ref on that file and emits the entries of the uchime_ref result
  # file to the output stream. Records without a sequence are emitted to the
  # output stream unchanged.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      # @status is read by the private methods below but never assigned in
      # this class, so status_init is presumably what sets it - confirm.
      status_init(status, STATS)

      TmpDir.create('input', 'output') do |tmp_in, tmp_out|
        process_input(input, output, tmp_in)
        run_uchime_ref(tmp_in, tmp_out)
        process_output(output, tmp_out)
      end
    end
  end

  private

  # Check options.
  #
  # Only :database and :cpus are accepted, :database is mandatory and must
  # name an existing file, and :cpus must be within
  # 1..BioDSL::Config::CORES_MAX.
  def check_options
    options_allowed(@options, :database, :cpus)
    options_required(@options, :database)
    options_files_exist(@options, :database)
    options_assert(@options, ':cpus >= 1')
    options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
  end

  # Process input stream and save records with sequences to a temporary FASTA
  # file or emit non-sequence containing records to the output stream.
  #
  # @param input  [Enumerator]          Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  # @param tmp_in [String]              Path to temporary FASTA file.
  def process_input(input, output, tmp_in)
    BioDSL::Fasta.open(tmp_in, 'w') do |ios|
      input.each_with_index do |record, i|
        @status[:records_in] += 1

        if record[:SEQ]
          @status[:sequences_in] += 1
          @status[:residues_in]  += record[:SEQ].length

          # Records lacking a name are named after their position in the
          # input stream.
          seq_name = record[:SEQ_NAME] || i.to_s
          entry    = BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])

          ios.puts entry.to_fasta
        else
          output << record
          @status[:records_out] += 1
        end
      end
    end
  end

  # Run uchime_ref on input file and save result in output file.
  #
  # A usearch error whose message contains "Empty input file" is swallowed on
  # purpose; any other usearch error is re-raised.
  #
  # @param tmp_in  [String] Path to input file.
  # @param tmp_out [String] Path to output file.
  #
  # @raise [BioDSL::UsearchError] If command fails.
  def run_uchime_ref(tmp_in, tmp_out)
    uchime_opts = {
      input:    tmp_in,
      output:   tmp_out,
      database: @options[:database],
      strand:   @options[:strand],
      cpus:     @options[:cpus],
      # NOTE(review): :verbose is not in the options_allowed list of
      # check_options, so this looks like it is always nil - confirm.
      verbose:  @options[:verbose]
    }

    BioDSL::Usearch.uchime_ref(uchime_opts)
  rescue BioDSL::UsearchError => e
    raise unless e.message.include?('Empty input file')
  end

  # Process uchime_ref output data and emit to output stream.
  #
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_out [String] Path to file with uchime_ref data.
  def process_output(output, tmp_out)
    # run_uchime_ref ignores the "Empty input file" error, in which case
    # usearch presumably produced no result file; then there is nothing to
    # emit and opening the file would fail.
    return unless File.exist?(tmp_out)

    BioDSL::Fasta.open(tmp_out) do |ios|
      ios.each do |entry|
        output << entry.to_bp

        @status[:sequences_out] += 1
        @status[:residues_out]  += entry.length
        @status[:records_out]   += 1
      end
    end
  end
end
|
|
170
|
+
end
|