BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Remove gaps from sequences or gap only columns in alignments.
|
|
30
|
+
#
|
|
31
|
+
# +degap_seq+ removes gaps from sequences (the letters ~-_.). If the option
|
|
32
|
+
# +columns_only+ is used then gaps from aligned sequences will be removed, if
|
|
33
|
+
# and only if the entire column consists of gaps.
|
|
34
|
+
#
|
|
35
|
+
# == Usage
|
|
36
|
+
#
|
|
37
|
+
# degap_seq([columns_only: <bool>])
|
|
38
|
+
#
|
|
39
|
+
# === Options
|
|
40
|
+
#
|
|
41
|
+
# * columns_only: <bool> - Remove gap columns only (default=false).
|
|
42
|
+
#
|
|
43
|
+
# == Examples
|
|
44
|
+
#
|
|
45
|
+
# Consider the following FASTA entries in the file `test.fna`:
|
|
46
|
+
#
|
|
47
|
+
# >test1
|
|
48
|
+
# A-G~T.C_
|
|
49
|
+
# >test2
|
|
50
|
+
# AGG_T-C~
|
|
51
|
+
#
|
|
52
|
+
# To remove all gaps from all sequences do:
|
|
53
|
+
#
|
|
54
|
+
# BP.new.read_fasta(input: "test.fna").degap_seq.dump.run
|
|
55
|
+
#
|
|
56
|
+
# {:SEQ_NAME=>"test1", :SEQ=>"AGTC", :SEQ_LEN=>4}
|
|
57
|
+
# {:SEQ_NAME=>"test2", :SEQ=>"AGGTC", :SEQ_LEN=>5}
|
|
58
|
+
#
|
|
59
|
+
#
|
|
60
|
+
# To remove all gap-only columns use the +columns_only+ option:
|
|
61
|
+
#
|
|
62
|
+
# BP.new.
|
|
63
|
+
# read_fasta(input: "test.fna").
|
|
64
|
+
# degap_seq(columns_only: true).
|
|
65
|
+
# dump.
|
|
66
|
+
# run
|
|
67
|
+
#
|
|
68
|
+
# {:SEQ_NAME=>"test1", :SEQ=>"A-GTC", :SEQ_LEN=>5}
|
|
69
|
+
# {:SEQ_NAME=>"test2", :SEQ=>"AGGTC", :SEQ_LEN=>5}
|
|
70
|
+
#
|
|
71
|
+
# rubocop:disable ClassLength
|
|
72
|
+
class DegapSeq
  require 'narray'

  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for DegapSeq.
  #
  # @param options [Hash] Options Hash.
  #
  # @option options [Boolean] :columns_only
  #   Flag indicating that only gap-only columns should be removed.
  #
  # @return [DegapSeq] Instance of DegapSeq.
  def initialize(options)
    @options = options
    # All gap characters joined into one String; used both as the argument to
    # String#delete! and as the set of byte values matched column-wise.
    @indels  = BioDSL::Seq::INDELS.sort.join('')

    reset_mask_state
    check_options
  end

  # Return the command lambda for DegapSeq.
  #
  # With +columns_only+ set, the lambda additionally stores the number of
  # removed columns in status[:columns_removed]; that number is 0 when the
  # stream contained no sequence records.
  #
  # NOTE(review): the private helpers update @status, which is presumably
  # assigned by status_init (defined outside this class) - confirm.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      if @options[:columns_only]
        degap_columns(input, output)
        # @na_mask stays nil when no record carried a sequence.
        status[:columns_removed] = @na_mask ? @na_mask.count_false : 0
      else
        degap_all(input, output)
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :columns_only)
    options_allowed_values(@options, columns_only: [true, false, nil])
  end

  # Reset the alignment state collected while scanning the input so that
  # every invocation of the command lambda starts from scratch.
  def reset_mask_state
    @na_mask = nil # Per-column gap counts, later the keep-column mask.
    @max_len = nil # Length of the first sequence seen.
    @count   = 0   # Number of sequence records seen.
  end

  # Remove all gap-only columns from all sequences in input stream and output
  # to output stream.
  #
  # The stream is consumed twice: first to count gaps per column while
  # spooling every record to a temporary file, then to rewrite the spooled
  # records using the finished column mask.
  #
  # @param input  [Enumerator]          Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  def degap_columns(input, output)
    reset_mask_state

    TmpDir.create('degap_seq') do |tmp_file, _|
      process_input(input, tmp_file)
      create_mask
      process_output(output, tmp_file)
    end
  end

  # Serialize all input records to a temporary file and at the same time add
  # all sequence type records to the gap mask.
  #
  # @param input    [Enumerator] Input stream.
  # @param tmp_file [String]     Path to temporary file.
  def process_input(input, tmp_file)
    File.open(tmp_file, 'wb') do |ios|
      BioDSL::Serializer.new(ios) do |s|
        input.each do |record|
          @status[:records_in] += 1

          if (seq = record[:SEQ])
            mask_add(seq)
            @count += 1
          end

          s << record
        end
      end
    end
  end

  # Add sequence gaps to mask.
  #
  # For every gap character the positions where it occurs are added to
  # @na_mask, so each cell ends up holding the number of sequences that have
  # a gap in that column.
  #
  # @param seq [String] Sequence.
  #
  # @raise [BioDSL::SeqError] if the sequence length differs from the first
  #   sequence seen.
  def mask_add(seq)
    @status[:sequences_in] += 1
    @status[:residues_in]  += seq.length

    @max_len ||= seq.length

    check_length(seq)

    @na_mask ||= NArray.int(seq.length)
    na_seq = NArray.to_na(seq, 'byte')
    @indels.each_char { |c| @na_mask += na_seq.eq(c.ord) }
  end

  # Check if sequence length matches max_len.
  #
  # @param seq [String] Sequence.
  #
  # @raise [BioDSL::SeqError] if sequence length and max_len don't match.
  def check_length(seq)
    return if @max_len == seq.length
    fail BioDSL::SeqError,
         "Uneven seq lengths: #{@max_len} != #{seq.length}"
  end

  # Turn the per-column gap counts into a mask of the columns to keep.
  #
  # A column is gap-only exactly when its gap count equals the number of
  # sequences, so the mask is true where the count differs from @count.
  # Nothing is done when no sequence was seen (there is no mask to convert).
  def create_mask
    return unless @na_mask

    @na_mask = @na_mask.ne @count
  end

  # Read all serialized records from the temporary file and emit to the output
  # stream records with degapped sequences.
  #
  # @param output   [Enumerator::Yielder] Output stream.
  # @param tmp_file [String]              Path to temporary file.
  def process_output(output, tmp_file)
    File.open(tmp_file, 'rb') do |ios|
      BioDSL::Serializer.new(ios) do |s|
        s.each do |record|
          remove_residues(record) if record[:SEQ]

          output << record
          @status[:records_out] += 1
        end
      end
    end
  end

  # Given a BioDSL record containing sequence information
  # remove all residues based on the na_mask.
  #
  # NOTE(review): only :SEQ and :SEQ_LEN are rewritten; if records can carry
  # a per-residue :SCORES string it would no longer line up with :SEQ -
  # confirm against callers.
  #
  # @param record [Hash] BioDSL record.
  def remove_residues(record)
    na_seq = NArray.to_na(record[:SEQ], 'byte')
    # Indexing with the mask keeps only the columns flagged true.
    record[:SEQ]     = na_seq[@na_mask].to_s
    record[:SEQ_LEN] = record[:SEQ].length

    @status[:sequences_out] += 1
    @status[:residues_out]  += record[:SEQ].length
  end

  # Remove all gaps from all sequences in input stream and output to output
  # stream.
  #
  # @param input  [Enumerator]          Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  def degap_all(input, output)
    input.each do |record|
      @status[:records_in] += 1

      degap_seq(record) if record.key? :SEQ

      output << record

      @status[:records_out] += 1
    end
  end

  # Given a BioDSL record with sequence information, remove all gaps from
  # the sequence.
  #
  # @param record [Hash] BioDSL record.
  def degap_seq(record)
    entry = BioDSL::Seq.new_bp(record)

    @status[:sequences_in] += 1
    @status[:residues_in]  += entry.length

    entry.seq.delete!(@indels)

    @status[:sequences_out] += 1
    @status[:residues_out]  += entry.length

    record.merge! entry.to_bp
  end
end
|
|
253
|
+
end
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Dereplicate sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +dereplicate_seq+ removes all duplicate sequence records. Dereplicated
|
|
32
|
+
# sequences are output along with the count of replicates. Using the
|
|
33
|
+
# +ignore_case+ option disables the default case sensitive sequence matching.
|
|
34
|
+
#
|
|
35
|
+
# == Usage
|
|
36
|
+
#
|
|
37
|
+
# dereplicate_seq([ignore_case: <bool>])
|
|
38
|
+
#
|
|
39
|
+
# === Options
|
|
40
|
+
#
|
|
41
|
+
# * ignore_case: <bool> - Ignore sequence case.
|
|
42
|
+
#
|
|
43
|
+
# == Examples
|
|
44
|
+
#
|
|
45
|
+
# Consider the following FASTA file test.fna:
|
|
46
|
+
#
|
|
47
|
+
# >test1
|
|
48
|
+
# ATGC
|
|
49
|
+
# >test2
|
|
50
|
+
# ATGC
|
|
51
|
+
# >test3
|
|
52
|
+
# GCAT
|
|
53
|
+
#
|
|
54
|
+
# To dereplicate all sequences we use +read_fasta+ and +dereplicate_seq+:
|
|
55
|
+
#
|
|
56
|
+
# BP.new.read_fasta(input: "test.fna").dereplicate_seq.dump.run
|
|
57
|
+
#
|
|
58
|
+
# {:SEQ_NAME=>"test1", :SEQ=>"ATGC", :SEQ_LEN=>4, :SEQ_COUNT=>2}
|
|
59
|
+
# {:SEQ_NAME=>"test3", :SEQ=>"GCAT", :SEQ_LEN=>4, :SEQ_COUNT=>1}
|
|
60
|
+
class DereplicateSeq
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for the DereplicateSeq class.
  #
  # @param options [Hash] Options hash.
  # @option options [Boolean] :ignore_case Ignore sequence case.
  #
  # @return [DereplicateSeq] Class instance.
  def initialize(options)
    @options = options
    @lookup  = {}

    check_options
  end

  # Return the command lambda for DereplicateSeq.
  #
  # NOTE(review): the private helpers update @status, which is presumably
  # assigned by status_init (defined outside this class) - confirm.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      # Start each run with an empty table so replicate counts from an
      # earlier invocation of this lambda cannot leak into this one.
      @lookup = {}

      TmpDir.create('dereplicate_seq') do |tmp_file, _|
        process_input(input, output, tmp_file)
        process_output(output, tmp_file)
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :ignore_case)
    options_allowed_values(@options, ignore_case: [nil, true, false])
  end

  # Process input stream and serialize all records with sequence information.
  # All other records are emitted to the output stream.
  #
  # @param input    [Enumerator]          Input stream.
  # @param output   [Enumerator::Yielder] Output stream.
  # @param tmp_file [String]              Path to temporary file.
  def process_input(input, output, tmp_file)
    File.open(tmp_file, 'wb') do |ios|
      BioDSL::Serializer.new(ios) do |s|
        input.each do |record|
          @status[:records_in] += 1

          if record.key? :SEQ
            serialize(record, s)
          else
            output << record

            @status[:records_out] += 1
          end
        end
      end
    end
  end

  # Serialize records with unique sequences and keep a count of how many times
  # each sequence was encountered.
  #
  # Only the first record seen for a given sequence is written to the
  # serializer; later replicates just increment the count.
  #
  # @param record [Hash]               BioDSL record.
  # @param s      [BioDSL::Serializer] Serializer.
  def serialize(record, s)
    seq = record[:SEQ]

    @status[:sequences_in] += 1
    @status[:residues_in]  += seq.length

    key = lookup_key(seq)

    unless @lookup.key?(key)
      s << record

      @lookup[key] = 0
    end

    @lookup[key] += 1
  end

  # Build the lookup table key for a sequence.
  #
  # Plain Strings are used as keys (Hash stores its own frozen copy), which
  # avoids interning every distinct sequence as a Symbol. The record's own
  # sequence is never modified.
  #
  # @param seq [String] Sequence.
  #
  # @return [String] Sequence, lowercased if +ignore_case+ is set.
  def lookup_key(seq)
    @options[:ignore_case] ? seq.downcase : seq
  end

  # Read all serialized records from tmp file and emit to the output stream
  # along with the sequence count.
  #
  # @param output   [Enumerator::Yielder] Output stream.
  # @param tmp_file [String]              Path to tmp file.
  def process_output(output, tmp_file)
    File.open(tmp_file, 'rb') do |ios|
      BioDSL::Serializer.new(ios) do |s|
        s.each do |record|
          seq = record[:SEQ]

          @status[:residues_out] += seq.length
          record[:SEQ_COUNT] = @lookup[lookup_key(seq)]

          output << record

          @status[:records_out]   += 1
          @status[:sequences_out] += 1
        end
      end
    end
  end
end
|
|
168
|
+
end
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Dump records in stream to STDOUT.
|
|
30
|
+
#
|
|
31
|
+
# +dump+ outputs records from the stream to STDOUT.
|
|
32
|
+
#
|
|
33
|
+
# == Usage
|
|
34
|
+
#
|
|
35
|
+
# dump([first: <uint> |last: <uint>])
|
|
36
|
+
#
|
|
37
|
+
# === Options
|
|
38
|
+
#
|
|
39
|
+
# * first <uint> - Only dump the first number of records.
|
|
40
|
+
# * last <uint> - Only dump the last number of records.
|
|
41
|
+
#
|
|
42
|
+
# == Examples
|
|
43
|
+
#
|
|
44
|
+
# To dump all records in the stream:
|
|
45
|
+
#
|
|
46
|
+
# dump
|
|
47
|
+
#
|
|
48
|
+
# To dump only the _first_ 10 records:
|
|
49
|
+
#
|
|
50
|
+
# dump(first: 10)
|
|
51
|
+
#
|
|
52
|
+
# To dump only the _last_ 10 records:
|
|
53
|
+
#
|
|
54
|
+
# dump(last: 10)
|
|
55
|
+
class Dump
  STATS = %i(records_in records_out)

  # Constructor for the Dump class.
  #
  # @param [Hash] options Options hash.
  # @option options [Integer] :first Dump first number of records.
  # @option options [Integer] :last  Dump last number of records.
  #
  # @return [Dump] Returns an instance of the Dump class.
  def initialize(options)
    @options = options

    check_options
  end

  # Return a lambda for the dump command.
  #
  # NOTE(review): the private helpers update @status, which is presumably
  # assigned by status_init (defined outside this class) - confirm.
  #
  # @return [Proc] Returns the dump command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      if @options[:first]
        dump_first(input, output)
      elsif @options[:last]
        dump_last(input, output)
      else
        dump_all(input, output)
      end
    end
  end

  private

  # Check that only the :first and :last options are given, that they are
  # not combined, and that whichever is given is greater than zero.
  def check_options
    options_allowed(@options, :first, :last)
    options_unique(@options, :first, :last)
    options_assert(@options, ':first > 0')
    options_assert(@options, ':last > 0')
  end

  # Print a record to STDOUT and, if an output stream is present, forward the
  # record to it and count it as emitted.
  #
  # @param record [Object]                   Record to dump.
  # @param output [Enumerator::Yielder, nil] Output stream, if any.
  def emit(record, output)
    puts record

    return unless output

    output << record
    @status[:records_out] += 1
  end

  # Dump the first number of records.
  #
  # Only the records actually taken from the input are counted in
  # records_in.
  #
  # @param input  [Enumerator]               Input stream.
  # @param output [Enumerator::Yielder, nil] Output stream.
  def dump_first(input, output)
    input.first(@options[:first]).each do |record|
      @status[:records_in] += 1

      emit(record, output)
    end
  end

  # Dump the last number of records.
  #
  # The whole input is read (and counted) while a sliding window keeps only
  # the most recent +last+ records; the window is dumped at the end.
  #
  # @param input  [Enumerator]               Input stream.
  # @param output [Enumerator::Yielder, nil] Output stream.
  def dump_last(input, output)
    limit  = @options[:last]
    window = []

    input.each do |record|
      @status[:records_in] += 1

      window << record
      window.shift if window.size > limit
    end

    window.each { |record| emit(record, output) }
  end

  # Dump all records.
  #
  # @param input  [Enumerator]               Input stream.
  # @param output [Enumerator::Yielder, nil] Output stream.
  def dump_all(input, output)
    input.each do |record|
      @status[:records_in] += 1

      emit(record, output)
    end
  end
end
|
|
157
|
+
end
|