BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Clip sequences in the stream at a specified primer location.
|
|
30
|
+
#
|
|
31
|
+
# +clip_primer+ locates a specified +primer+ in sequences in the stream and
|
|
32
|
+
# clips the sequence after the match if the +direction+ is forward or before
|
|
33
|
+
# the match if the +direction+ is reverse. Using the +reverse_complement+
|
|
34
|
+
# option the primer sequence will be reverse complemented prior to matching.
|
|
35
|
+
# Using the +search_distance+ option will limit the primer search to the
|
|
36
|
+
# beginning of the sequence if the +direction+ is forward and to the end if
|
|
37
|
+
# the direction is +reverse+.
|
|
38
|
+
#
|
|
39
|
+
# Non-perfect matching can be allowed by setting the allowed
|
|
40
|
+
# +mismatch_percent+, +insertion_percent+ and +deletion_percent+.
|
|
41
|
+
#
|
|
42
|
+
# The following keys are added to clipped records:
|
|
43
|
+
#
|
|
44
|
+
# * CLIP_PRIMER_DIR - Direction of clip.
|
|
45
|
+
# * CLIP_PRIMER_POS - Sequence position of clip (0 based).
|
|
46
|
+
# * CLIP_PRIMER_LEN - Length of clip match.
|
|
47
|
+
# * CLIP_PRIMER_PAT - Clip match pattern.
|
|
48
|
+
# == Usage
|
|
49
|
+
#
|
|
50
|
+
# clip_primer(<primer: <string>>, <direction: <:forward|:reverse>
|
|
51
|
+
# [, reverse_complement: <bool>[, search_distance: <uint>
|
|
52
|
+
# [, mismatch_percent: <uint>
|
|
53
|
+
# [, insertion_percent: <uint>
|
|
54
|
+
# [, deletion_percent: <uint>]]]]])
|
|
55
|
+
#
|
|
56
|
+
# === Options
|
|
57
|
+
#
|
|
58
|
+
# * primer: <string> - Primer sequence to search for.
|
|
59
|
+
# * direction: <:forward|:reverse> - Clip direction.
|
|
60
|
+
# * reverse_complement: <bool> -
|
|
61
|
+
# Reverse complement primer (default=false).
|
|
62
|
+
# * search_distance: <uint> -
|
|
63
|
+
# Search distance from forward or reverse end.
|
|
64
|
+
# * mismatch_percent: <uint> - Allowed percent mismatches (default=0).
|
|
65
|
+
# * insertion_percent: <uint> - Allowed percent insertions (default=0).
|
|
66
|
+
# * deletion_percent: <uint> - Allowed percent deletions (default=0).
|
|
67
|
+
#
|
|
68
|
+
# == Examples
|
|
69
|
+
#
|
|
70
|
+
# Consider the following FASTA entry in the file test.fna:
|
|
71
|
+
#
|
|
72
|
+
# >test
|
|
73
|
+
# actgactgaTCGTATGCCGTCTTCTGCTTactacgt
|
|
74
|
+
#
|
|
75
|
+
# To clip this sequence in the forward direction with the primer
|
|
76
|
+
# 'TGACTACGACTACGACTACT' do:
|
|
77
|
+
#
|
|
78
|
+
# BP.new.
|
|
79
|
+
# read_fasta(input: "test.fna").
|
|
80
|
+
# clip_primer(primer: "TGACTACGACTACGACTACT", direction: :forward).
|
|
81
|
+
# dump.
|
|
82
|
+
# run
|
|
83
|
+
#
|
|
84
|
+
# {:SEQ_NAME=>"test",
|
|
85
|
+
# :SEQ=>"actacgt",
|
|
86
|
+
# :SEQ_LEN=>7,
|
|
87
|
+
# :CLIP_PRIMER_DIR=>"FORWARD",
|
|
88
|
+
# :CLIP_PRIMER_POS=>9,
|
|
89
|
+
# :CLIP_PRIMER_LEN=>20,
|
|
90
|
+
# :CLIP_PRIMER_PAT=>"TGACTACGACTACGACTACT"}
|
|
91
|
+
#
|
|
92
|
+
# Or in the reverse direction:
|
|
93
|
+
#
|
|
94
|
+
# BP.new.
|
|
95
|
+
# read_fasta(input: "test.fna").
|
|
96
|
+
# clip_primer(primer: "TGACTACGACTACGACTACT", direction: :reverse).
|
|
97
|
+
# dump.
|
|
98
|
+
# run
|
|
99
|
+
#
|
|
100
|
+
# {:SEQ_NAME=>"test",
|
|
101
|
+
# :SEQ=>"actgactga",
|
|
102
|
+
# :SEQ_LEN=>9,
|
|
103
|
+
# :CLIP_PRIMER_DIR=>"REVERSE",
|
|
104
|
+
# :CLIP_PRIMER_POS=>9,
|
|
105
|
+
# :CLIP_PRIMER_LEN=>20,
|
|
106
|
+
# :CLIP_PRIMER_PAT=>"TGACTACGACTACGACTACT"}
|
|
107
|
+
# rubocop:disable ClassLength
|
|
108
|
+
class ClipPrimer
  STATS = %i(records_in records_out sequences_in sequences_out
             residues_in residues_out pattern_hits pattern_misses)

  # Constructor for ClipPrimer.
  #
  # @param options [Hash] Options hash.
  # @option options [String]  :primer             Primer used for matching.
  # @option options [Symbol]  :direction          Direction for clipping.
  # @option options [Integer] :search_distance    Search distance.
  # @option options [Boolean] :reverse_complement
  #   Flag indicating that primer should be reverse complemented.
  # @option options [Integer] :mismatch_percent   Allowed percent mismatches.
  # @option options [Integer] :insertion_percent  Allowed percent insertions.
  # @option options [Integer] :deletion_percent   Allowed percent deletions.
  #
  # @return [ClipPrimer] Returns ClipPrimer instance.
  def initialize(options)
    @options = options
    defaults
    check_options

    # The primer must be resolved first: the calc_* methods scale the
    # percentages by the length of the primer.
    @primer = primer
    @mis    = calc_mis
    @ins    = calc_ins
    @del    = calc_del
  end

  # Lambda for ClipPrimer command.
  #
  # Every record is passed on to the output; records with a non-empty :SEQ
  # are run through clip_primer first.
  #
  # @return [Proc] Lambda for command.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        clip_primer(record) if record[:SEQ] && record[:SEQ].length > 0

        output << record
        @status[:records_out] += 1
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :primer, :direction, :search_distance,
                    :reverse_complement, :mismatch_percent,
                    :insertion_percent, :deletion_percent)
    options_required(@options, :primer, :direction)
    options_allowed_values(@options, direction: [:forward, :reverse])
    options_allowed_values(@options, reverse_complement: [true, false])
    options_assert(@options, ':search_distance > 0')
    options_assert(@options, ':mismatch_percent >= 0')
    options_assert(@options, ':insertion_percent >= 0')
    options_assert(@options, ':deletion_percent >= 0')
  end

  # Set default option values.
  def defaults
    @options[:mismatch_percent]  ||= 0
    @options[:insertion_percent] ||= 0
    @options[:deletion_percent]  ||= 0
  end

  # Calculate the number of mismatches allowed, i.e. the mismatch percentage
  # applied to the primer length and rounded.
  #
  # @return [Integer] Maximum number of mismatches.
  def calc_mis
    (@primer.length * @options[:mismatch_percent] * 0.01).round
  end

  # Calculate the number of insertions allowed, i.e. the insertion
  # percentage applied to the primer length and rounded.
  #
  # @return [Integer] Maximum number of insertions.
  def calc_ins
    (@primer.length * @options[:insertion_percent] * 0.01).round
  end

  # Calculate the number of deletions allowed, i.e. the deletion percentage
  # applied to the primer length and rounded.
  #
  # @return [Integer] Maximum number of deletions.
  def calc_del
    (@primer.length * @options[:deletion_percent] * 0.01).round
  end

  # Reset any previous clip_primer results from record.
  #
  # @param record [Hash] BioPiece record to reset.
  def reset(record)
    record.delete :CLIP_PRIMER_DIR
    record.delete :CLIP_PRIMER_POS
    record.delete :CLIP_PRIMER_LEN
    record.delete :CLIP_PRIMER_PAT
  end

  # Clip a single sequence record in the configured direction and update
  # the sequence and residue statistics.
  #
  # @param record [Hash] BioPiece record with sequence.
  #
  # @raise [RuntimeError] if the direction is neither :forward nor :reverse.
  def clip_primer(record)
    reset(record)
    entry = BioDSL::Seq.new_bp(record)

    @status[:sequences_in] += 1
    @status[:residues_in]  += entry.length

    # The helpers return the entry that ends up in the record, so that
    # residues_out reflects the clipped length and not the input length.
    clipped =
      case @options[:direction]
      when :forward then clip_primer_forward(record, entry)
      when :reverse then clip_primer_reverse(record, entry)
      else
        fail RuntimeError, 'This should never happen'
      end

    @status[:sequences_out] += 1
    @status[:residues_out]  += clipped.length
  end

  # Clip forward primer from entry and save clip information
  # in record. The part of the sequence after the match is kept.
  #
  # @param record [Hash]        BioPiece record with sequence.
  # @param entry  [BioDSL::Seq] Sequence entry.
  #
  # @return [BioDSL::Seq] Clipped entry, or the given entry if not clipped.
  def clip_primer_forward(record, entry)
    match = entry.patmatch(@primer, start: 0, stop: stop(entry),
                                    max_mismatches: @mis,
                                    max_insertions: @ins,
                                    max_deletions: @del)

    unless match
      @status[:pattern_misses] += 1
      return entry
    end

    @status[:pattern_hits] += 1

    clip_start = match.pos + match.length
    return entry if clip_start > entry.length

    clipped = entry[clip_start..-1]
    merge_record_entry(record, clipped, match, 'FORWARD')
    clipped
  end

  # Calculate the match stop position: the search distance minus the primer
  # length, but never less than 0.
  #
  # @param entry [BioDSL::Seq] Sequence entry.
  #
  # @return [Integer] Match stop position.
  def stop(entry)
    stop = search_distance(entry) - @primer.length
    stop < 0 ? 0 : stop
  end

  # Clip reverse primer from entry and save clip information
  # in record. The part of the sequence before the match is kept.
  #
  # @param record [Hash]        BioPiece record with sequence.
  # @param entry  [BioDSL::Seq] Sequence entry.
  #
  # @return [BioDSL::Seq] Clipped entry, or the given entry if not clipped.
  def clip_primer_reverse(record, entry)
    start = entry.length - search_distance(entry)
    match = entry.patmatch(@primer, start: start,
                                    stop: entry.length - 1,
                                    max_mismatches: @mis,
                                    max_insertions: @ins,
                                    max_deletions: @del)

    unless match
      @status[:pattern_misses] += 1
      return entry
    end

    @status[:pattern_hits] += 1

    clipped = entry[0...match.pos]
    merge_record_entry(record, clipped, match, 'REVERSE')
    clipped
  end

  # Merge entry and match info to record.
  #
  # @param record [Hash]          BioDSL record.
  # @param entry  [BioDSL::Seq]   Sequence entry.
  # @param match  [BioDSL::Match] Match object.
  # @param type   [String]        Clip direction label stored in the record.
  def merge_record_entry(record, entry, match, type)
    record.merge!(entry.to_bp)
    record[:CLIP_PRIMER_DIR] = type
    record[:CLIP_PRIMER_POS] = match.pos
    record[:CLIP_PRIMER_LEN] = match.length
    record[:CLIP_PRIMER_PAT] = match.match
  end

  # Return the primer sequence and reverse-complement according to options.
  #
  # @return [String] Primer sequence.
  def primer
    return @options[:primer] unless @options[:reverse_complement]

    BioDSL::Seq.new(seq: @options[:primer], type: :dna).
      reverse.complement.seq
  end

  # Determine the search distance from the search_distance in the options or
  # as the sequence length, whichever is shorter.
  #
  # @param entry [BioDSL::Seq] Sequence entry.
  #
  # @return [Integer] Search distance.
  def search_distance(entry)
    distance = @options[:search_distance]

    distance && distance < entry.length ? distance : entry.length
  end
end
|
|
318
|
+
end
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Create OTUs from sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# Use the +usearch+ program cluster_otus to cluster sequences in the stream
|
|
32
|
+
# and output a representative sequence from each cluster. Sequences must
|
|
33
|
+
# be dereplicated and sorted according to +SEQ_COUNT+ in decreasing order.
|
|
34
|
+
#
|
|
35
|
+
# Please refer to the manual:
|
|
36
|
+
#
|
|
37
|
+
# http://drive5.com/usearch/manual/cluster_otus.html
|
|
38
|
+
#
|
|
39
|
+
# Usearch 7.0 must be installed for +usearch+ to work. Read more here:
|
|
40
|
+
#
|
|
41
|
+
# http://www.drive5.com/usearch/
|
|
42
|
+
#
|
|
43
|
+
# == Usage
|
|
44
|
+
#
|
|
45
|
+
# cluster_otus([identity: <float>])
|
|
46
|
+
#
|
|
47
|
+
# === Options
|
|
48
|
+
#
|
|
49
|
+
# * identity: <float> - OTU cluster identity between 0.0 and 1.0
|
|
50
|
+
# (Default 0.97).
|
|
51
|
+
#
|
|
52
|
+
# == Examples
|
|
53
|
+
#
|
|
54
|
+
# To create OTU clusters do:
|
|
55
|
+
#
|
|
56
|
+
# BP.new.
|
|
57
|
+
# read_fasta(input: "in.fna").
|
|
58
|
+
# dereplicate_seq.
|
|
59
|
+
# sort(key: :SEQ_COUNT, reverse: true).
|
|
60
|
+
# cluster_otus.
|
|
61
|
+
# run
|
|
62
|
+
class ClusterOtus
  require 'BioDSL/helpers/aux_helper'

  include AuxHelper

  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for ClusterOtus.
  #
  # @param options [Hash] Options hash.
  # @option options [Float] :identity Cluster identity.
  #
  # @return [ClusterOtus] Instance of ClusterOtus.
  def initialize(options)
    @options = options

    aux_exist('usearch')
    check_options
    defaults
  end

  # Return the ClusterOtus command lambda.
  #
  # Sequence records are written to a temporary FASTA file which is
  # clustered with +usearch cluster_otus+; the resulting OTU entries are
  # emitted to the output stream. Records without :SEQ are passed through.
  #
  # @return [Proc] Lambda for the command.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      TmpDir.create('tmp.fa', 'tmp.uc') do |tmp_in, tmp_out|
        process_input(input, output, tmp_in)

        BioDSL::Usearch.cluster_otus(input: tmp_in, output: tmp_out,
                                     identity: @options[:identity],
                                     verbose: @options[:verbose])

        process_output(output, tmp_out)
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :identity)
    options_assert(@options, ':identity >= 0.0')
    options_assert(@options, ':identity <= 1.0')
  end

  # Set default options.
  def defaults
    @options[:identity] ||= 0.97
  end

  # Process input records and save sequence data to a temporary FASTA file for
  # use with +usearch cluster_otus+.
  #
  # @param input  [Enumerator]          Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  # @param tmp_in [String]              Path to temporary FASTA file.
  def process_input(input, output, tmp_in)
    BioDSL::Fasta.open(tmp_in, 'w') do |ios|
      input.each_with_index do |record, i|
        @status[:records_in] += 1

        if record.key? :SEQ
          @status[:sequences_in] += 1
          @status[:residues_in]  += record[:SEQ].length
          ios.puts record2entry(record, i).to_fasta
        else
          output << record
          @status[:records_out] += 1
        end
      end
    end
  end

  # Create a Sequence entry from a record using the record index as sequence
  # name if no such is found. The sequence count is appended to the name as
  # ";size=<SEQ_COUNT>".
  #
  # The name is built as a new String so the record's own :SEQ_NAME is left
  # untouched (and frozen names do not raise).
  #
  # @param record [Hash]    BioDSL record.
  # @param i      [Integer] Record index
  #
  # @raise [BioDSL::SeqError] if the record has no SEQ_COUNT.
  #
  # @return [BioDSL::Seq] Sequence entry.
  def record2entry(record, i)
    fail BioDSL::SeqError, 'Missing SEQ_COUNT' unless record.key? :SEQ_COUNT

    seq_name = "#{record[:SEQ_NAME] || i};size=#{record[:SEQ_COUNT]}"

    BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])
  end

  # Process the cluster output and emit otus to the output stream.
  #
  # The ";size=<n>" suffix of each sequence name is moved into :SEQ_COUNT.
  #
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_out [String]              Path to temporary OTU file.
  #
  # @raise [UsearchError] if size info is missing from SEQ_NAME.
  def process_output(output, tmp_out)
    BioDSL::Fasta.open(tmp_out) do |ios|
      ios.each do |entry|
        record = entry.to_bp

        if record[:SEQ_NAME] =~ /;size=(\d+)$/
          record[:SEQ_COUNT] = Regexp.last_match(1).to_i
          record[:SEQ_NAME].sub!(/;size=\d+$/, '')
        else
          fail BioDSL::UsearchError, 'Missing size in SEQ_NAME: ' \
                                     "#{record[:SEQ_NAME]}"
        end

        output << record
        @status[:sequences_out] += 1
        @status[:residues_out]  += record[:SEQ].length
        @status[:records_out]   += 1
      end
    end
  end
end
|
|
181
|
+
end
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Collapse OTUs based on identical taxonomy strings.
|
|
30
|
+
#
|
|
31
|
+
# +collapse_otus+ collapses OTUs in OTU style records if the TAXONOMY string
|
|
32
|
+
# is redundant. At the same time the sample counts (+_COUNT+) are summed for
|
|
33
|
+
# the collapsed OTUs.
|
|
34
|
+
#
|
|
35
|
+
# == Usage
|
|
36
|
+
#
|
|
37
|
+
# collapse_otus
|
|
38
|
+
#
|
|
39
|
+
# === Options
|
|
40
|
+
#
|
|
41
|
+
# == Examples
|
|
42
|
+
#
|
|
43
|
+
# Here is an OTU table with four rows, one of which has a redundant Taxonomy
|
|
44
|
+
# string:
|
|
45
|
+
#
|
|
46
|
+
# BP.new.read_table(input: "otu_table.txt").dump.run
|
|
47
|
+
#
|
|
48
|
+
# {:OTU=>"OTU_1",
|
|
49
|
+
# :CM1_COUNT=>881,
|
|
50
|
+
# :CM10_COUNT=>234,
|
|
51
|
+
# :TAXONOMY=>
|
|
52
|
+
# "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
|
|
53
|
+
# Leuconostocaceae(100);Leuconostoc(100)"}
|
|
54
|
+
# {:OTU=>"OTU_0",
|
|
55
|
+
# :CM1_COUNT=>3352,
|
|
56
|
+
# :CM10_COUNT=>4329,
|
|
57
|
+
# :TAXONOMY=>
|
|
58
|
+
# "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
|
|
59
|
+
# Streptococcaceae(100);Lactococcus(100)"}
|
|
60
|
+
# {:OTU=>"OTU_5",
|
|
61
|
+
# :CM1_COUNT=>5,
|
|
62
|
+
# :CM10_COUNT=>0,
|
|
63
|
+
# :TAXONOMY=>
|
|
64
|
+
# "Bacteria(100);Proteobacteria(100);Gammaproteobacteria(100); \
|
|
65
|
+
# Pseudomonadales(100);Pseudomonadaceae(100);Pseudomonas(100)"}
|
|
66
|
+
# {:OTU=>"OTU_3",
|
|
67
|
+
# :CM1_COUNT=>228,
|
|
68
|
+
# :CM10_COUNT=>200,
|
|
69
|
+
# :TAXONOMY=>
|
|
70
|
+
# "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
|
|
71
|
+
# Streptococcaceae(100);Lactococcus(100)"}
|
|
72
|
+
#
|
|
73
|
+
# In order to collapse the redundant OTU simply run the stream through
|
|
74
|
+
# +collapse_otus+:
|
|
75
|
+
#
|
|
76
|
+
# BP.new.read_table(input: "otu_table.txt").collapse_otus.dump.run
|
|
77
|
+
#
|
|
78
|
+
# {:OTU=>"OTU_1",
|
|
79
|
+
# :CM1_COUNT=>881,
|
|
80
|
+
# :CM10_COUNT=>234,
|
|
81
|
+
# :TAXONOMY=>
|
|
82
|
+
# "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
|
|
83
|
+
# Leuconostocaceae(100);Leuconostoc(100)"}
|
|
84
|
+
# {:OTU=>"OTU_0",
|
|
85
|
+
# :CM1_COUNT=>3580,
|
|
86
|
+
# :CM10_COUNT=>4529,
|
|
87
|
+
# :TAXONOMY=>
|
|
88
|
+
# "Bacteria(100);Firmicutes(100);Bacilli(100);Lactobacillales(100); \
|
|
89
|
+
# Streptococcaceae(100);Lactococcus(100)"}
|
|
90
|
+
# {:OTU=>"OTU_5",
|
|
91
|
+
# :CM1_COUNT=>5,
|
|
92
|
+
# :CM10_COUNT=>0,
|
|
93
|
+
# :TAXONOMY=>
|
|
94
|
+
# "Bacteria(100);Proteobacteria(100);Gammaproteobacteria(100); \
|
|
95
|
+
# Pseudomonadales(100);Pseudomonadaceae(100);Pseudomonas(100)"}
|
|
96
|
+
class CollapseOtus
  STATS = %i(records_in records_out otus_in otus_out)

  # Constructor for CollapseOtus.
  #
  # @param options [Hash] Options Hash.
  #
  # @return [CollapseOtus] Instance of CollapseOtus.
  def initialize(options)
    @options = options

    check_options
  end

  # Return the CollapseOtus command lambda.
  #
  # Records without :TAXONOMY are passed straight through. Records with
  # :TAXONOMY are collected, collapsed and emitted after the input stream
  # has been exhausted.
  #
  # @return [Proc] Lambda for the command.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      hash = {}

      input.each do |record|
        @status[:records_in] += 1

        if record[:TAXONOMY]
          @status[:otus_in] += 1

          collapse_tax(hash, record)
        else
          output << record
          @status[:records_out] += 1
        end
      end

      write_tax(hash, output)
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, nil)
  end

  # Collapse identical taxonomies by removing duplicates and adding their
  # counts.
  #
  # Taxonomies are compared with the "(<digits>)" annotations stripped. The
  # first record seen for a taxonomy is kept, and the +_COUNT+ values of
  # later records are added to it. A count column missing from the kept
  # record is treated as 0.
  #
  # @param hash   [Hash] Hash with taxonomy records.
  # @param record [Hash] BioDSL record with taxonomy info.
  def collapse_tax(hash, record)
    # A String key is used on purpose: the taxonomy comes from input data
    # and there is no reason to intern it as a Symbol.
    key    = record[:TAXONOMY].gsub(/\(\d+\)/, '')
    merged = hash[key]

    unless merged
      hash[key] = record
      return
    end

    record.each do |k, v|
      next unless k.to_s.end_with?('_COUNT')

      merged[k] = (merged[k] || 0) + v
    end
  end

  # Output collapsed taxonomy records.
  #
  # @param hash   [Hash]                Hash with taxonomy records.
  # @param output [Enumerator::Yielder] Output stream.
  def write_tax(hash, output)
    hash.each_value do |record|
      output << record
      @status[:otus_out]    += 1
      @status[:records_out] += 1
    end
  end
end
|
|
170
|
+
end
|