BioDSL 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/BioDSL.gemspec +1 -1
- data/Gemfile +6 -0
- data/README.md +289 -155
- data/Rakefile +18 -16
- data/lib/BioDSL.rb +1 -1
- data/lib/BioDSL/cary.rb +78 -53
- data/lib/BioDSL/command.rb +2 -2
- data/lib/BioDSL/commands.rb +1 -1
- data/lib/BioDSL/commands/add_key.rb +1 -1
- data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
- data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
- data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
- data/lib/BioDSL/commands/classify_seq.rb +8 -8
- data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
- data/lib/BioDSL/commands/clip_primer.rb +7 -7
- data/lib/BioDSL/commands/cluster_otus.rb +5 -5
- data/lib/BioDSL/commands/collapse_otus.rb +2 -2
- data/lib/BioDSL/commands/collect_otus.rb +2 -2
- data/lib/BioDSL/commands/complement_seq.rb +4 -4
- data/lib/BioDSL/commands/count.rb +1 -1
- data/lib/BioDSL/commands/count_values.rb +2 -2
- data/lib/BioDSL/commands/degap_seq.rb +6 -7
- data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
- data/lib/BioDSL/commands/dump.rb +2 -2
- data/lib/BioDSL/commands/filter_rrna.rb +4 -4
- data/lib/BioDSL/commands/genecall.rb +7 -7
- data/lib/BioDSL/commands/grab.rb +1 -1
- data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
- data/lib/BioDSL/commands/mask_seq.rb +4 -4
- data/lib/BioDSL/commands/mean_scores.rb +2 -2
- data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
- data/lib/BioDSL/commands/merge_table.rb +1 -1
- data/lib/BioDSL/commands/merge_values.rb +1 -1
- data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
- data/lib/BioDSL/commands/plot_histogram.rb +4 -4
- data/lib/BioDSL/commands/plot_matches.rb +5 -5
- data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
- data/lib/BioDSL/commands/plot_scores.rb +7 -7
- data/lib/BioDSL/commands/random.rb +1 -1
- data/lib/BioDSL/commands/read_fasta.rb +9 -9
- data/lib/BioDSL/commands/read_fastq.rb +16 -16
- data/lib/BioDSL/commands/read_table.rb +2 -3
- data/lib/BioDSL/commands/reverse_seq.rb +4 -4
- data/lib/BioDSL/commands/slice_align.rb +4 -4
- data/lib/BioDSL/commands/slice_seq.rb +3 -3
- data/lib/BioDSL/commands/sort.rb +1 -1
- data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
- data/lib/BioDSL/commands/split_values.rb +2 -2
- data/lib/BioDSL/commands/trim_primer.rb +13 -8
- data/lib/BioDSL/commands/trim_seq.rb +5 -5
- data/lib/BioDSL/commands/uchime_ref.rb +6 -6
- data/lib/BioDSL/commands/uclust.rb +5 -5
- data/lib/BioDSL/commands/unique_values.rb +1 -1
- data/lib/BioDSL/commands/usearch_global.rb +2 -2
- data/lib/BioDSL/commands/usearch_local.rb +2 -2
- data/lib/BioDSL/commands/write_fasta.rb +7 -9
- data/lib/BioDSL/commands/write_fastq.rb +4 -4
- data/lib/BioDSL/commands/write_table.rb +3 -3
- data/lib/BioDSL/commands/write_tree.rb +2 -3
- data/lib/BioDSL/config.rb +2 -2
- data/lib/BioDSL/csv.rb +8 -10
- data/lib/BioDSL/debug.rb +1 -1
- data/lib/BioDSL/fasta.rb +54 -40
- data/lib/BioDSL/fastq.rb +35 -32
- data/lib/BioDSL/filesys.rb +56 -47
- data/lib/BioDSL/fork.rb +1 -1
- data/lib/BioDSL/hamming.rb +1 -1
- data/lib/BioDSL/helpers.rb +1 -1
- data/lib/BioDSL/helpers/aux_helper.rb +1 -1
- data/lib/BioDSL/helpers/email_helper.rb +1 -1
- data/lib/BioDSL/helpers/history_helper.rb +1 -1
- data/lib/BioDSL/helpers/log_helper.rb +1 -1
- data/lib/BioDSL/helpers/options_helper.rb +1 -1
- data/lib/BioDSL/helpers/status_helper.rb +1 -1
- data/lib/BioDSL/html_report.rb +1 -1
- data/lib/BioDSL/math.rb +1 -1
- data/lib/BioDSL/mummer.rb +1 -1
- data/lib/BioDSL/pipeline.rb +1 -1
- data/lib/BioDSL/seq.rb +240 -231
- data/lib/BioDSL/seq/ambiguity.rb +1 -1
- data/lib/BioDSL/seq/assemble.rb +1 -1
- data/lib/BioDSL/seq/backtrack.rb +93 -76
- data/lib/BioDSL/seq/digest.rb +1 -1
- data/lib/BioDSL/seq/dynamic.rb +43 -55
- data/lib/BioDSL/seq/homopolymer.rb +34 -36
- data/lib/BioDSL/seq/kmer.rb +67 -50
- data/lib/BioDSL/seq/levenshtein.rb +35 -40
- data/lib/BioDSL/seq/translate.rb +64 -55
- data/lib/BioDSL/seq/trim.rb +60 -50
- data/lib/BioDSL/serializer.rb +1 -1
- data/lib/BioDSL/stream.rb +1 -1
- data/lib/BioDSL/taxonomy.rb +1 -1
- data/lib/BioDSL/test.rb +1 -1
- data/lib/BioDSL/tmp_dir.rb +1 -1
- data/lib/BioDSL/usearch.rb +1 -1
- data/lib/BioDSL/verbose.rb +1 -1
- data/lib/BioDSL/version.rb +2 -2
- data/test/BioDSL/commands/test_add_key.rb +1 -1
- data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_clip_primer.rb +1 -1
- data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
- data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
- data/test/BioDSL/commands/test_collect_otus.rb +1 -1
- data/test/BioDSL/commands/test_complement_seq.rb +1 -1
- data/test/BioDSL/commands/test_count.rb +1 -1
- data/test/BioDSL/commands/test_count_values.rb +1 -1
- data/test/BioDSL/commands/test_degap_seq.rb +1 -1
- data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
- data/test/BioDSL/commands/test_dump.rb +1 -1
- data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
- data/test/BioDSL/commands/test_genecall.rb +1 -1
- data/test/BioDSL/commands/test_grab.rb +1 -1
- data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
- data/test/BioDSL/commands/test_mask_seq.rb +1 -1
- data/test/BioDSL/commands/test_mean_scores.rb +1 -1
- data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_merge_table.rb +1 -1
- data/test/BioDSL/commands/test_merge_values.rb +1 -1
- data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
- data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
- data/test/BioDSL/commands/test_plot_matches.rb +1 -1
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_plot_scores.rb +1 -1
- data/test/BioDSL/commands/test_random.rb +1 -1
- data/test/BioDSL/commands/test_read_fasta.rb +1 -1
- data/test/BioDSL/commands/test_read_fastq.rb +1 -1
- data/test/BioDSL/commands/test_read_table.rb +1 -1
- data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
- data/test/BioDSL/commands/test_slice_align.rb +1 -1
- data/test/BioDSL/commands/test_slice_seq.rb +1 -1
- data/test/BioDSL/commands/test_sort.rb +1 -1
- data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_split_values.rb +1 -1
- data/test/BioDSL/commands/test_trim_primer.rb +1 -1
- data/test/BioDSL/commands/test_trim_seq.rb +1 -1
- data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
- data/test/BioDSL/commands/test_uclust.rb +1 -1
- data/test/BioDSL/commands/test_unique_values.rb +1 -1
- data/test/BioDSL/commands/test_usearch_global.rb +1 -1
- data/test/BioDSL/commands/test_usearch_local.rb +1 -1
- data/test/BioDSL/commands/test_write_fasta.rb +1 -1
- data/test/BioDSL/commands/test_write_fastq.rb +1 -1
- data/test/BioDSL/commands/test_write_table.rb +1 -1
- data/test/BioDSL/commands/test_write_tree.rb +1 -1
- data/test/BioDSL/helpers/test_options_helper.rb +3 -3
- data/test/BioDSL/seq/test_assemble.rb +58 -56
- data/test/BioDSL/seq/test_backtrack.rb +83 -81
- data/test/BioDSL/seq/test_digest.rb +47 -45
- data/test/BioDSL/seq/test_dynamic.rb +66 -64
- data/test/BioDSL/seq/test_homopolymer.rb +35 -33
- data/test/BioDSL/seq/test_kmer.rb +29 -28
- data/test/BioDSL/seq/test_translate.rb +44 -42
- data/test/BioDSL/seq/test_trim.rb +59 -57
- data/test/BioDSL/test_cary.rb +1 -1
- data/test/BioDSL/test_command.rb +2 -2
- data/test/BioDSL/test_csv.rb +34 -31
- data/test/BioDSL/test_debug.rb +31 -31
- data/test/BioDSL/test_fasta.rb +30 -29
- data/test/BioDSL/test_fastq.rb +27 -26
- data/test/BioDSL/test_filesys.rb +28 -27
- data/test/BioDSL/test_fork.rb +29 -28
- data/test/BioDSL/test_math.rb +31 -30
- data/test/BioDSL/test_mummer.rb +1 -1
- data/test/BioDSL/test_pipeline.rb +1 -1
- data/test/BioDSL/test_seq.rb +42 -41
- data/test/BioDSL/test_serializer.rb +35 -33
- data/test/BioDSL/test_stream.rb +28 -27
- data/test/BioDSL/test_taxonomy.rb +38 -37
- data/test/BioDSL/test_test.rb +32 -31
- data/test/BioDSL/test_tmp_dir.rb +1 -1
- data/test/BioDSL/test_usearch.rb +28 -27
- data/test/BioDSL/test_verbose.rb +32 -31
- data/test/helper.rb +34 -31
- metadata +3 -2
|
@@ -21,13 +21,11 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
28
28
|
module BioDSL
|
|
29
|
-
# rubocop:disable ClassLength
|
|
30
|
-
|
|
31
29
|
# == Assemble sequences in the stream using IDBA_UD.
|
|
32
30
|
#
|
|
33
31
|
# +assemble_seq_idba+ is a wrapper around the prokaryotic metagenome
|
|
@@ -99,7 +97,7 @@ module BioDSL
|
|
|
99
97
|
TmpDir.create('reads.fna', 'contig.fa') do |fa_in, fa_out, tmp_dir|
|
|
100
98
|
process_input(input, output, fa_in)
|
|
101
99
|
execute_idba(fa_in, tmp_dir)
|
|
102
|
-
|
|
100
|
+
process_output(output, fa_out)
|
|
103
101
|
end
|
|
104
102
|
|
|
105
103
|
calc_n50(status)
|
|
@@ -123,7 +121,7 @@ module BioDSL
|
|
|
123
121
|
def defaults
|
|
124
122
|
@options[:kmer_min] ||= 24
|
|
125
123
|
@options[:kmer_max] ||= 48
|
|
126
|
-
@options[:cpus]
|
|
124
|
+
@options[:cpus] ||= 1
|
|
127
125
|
end
|
|
128
126
|
|
|
129
127
|
# Read all records from input and emit non-sequence records to the output
|
|
@@ -141,7 +139,7 @@ module BioDSL
|
|
|
141
139
|
entry = BioDSL::Seq.new_bp(record)
|
|
142
140
|
|
|
143
141
|
@status[:sequences_in] += 1
|
|
144
|
-
@status[:residues_in]
|
|
142
|
+
@status[:residues_in] += entry.length
|
|
145
143
|
|
|
146
144
|
fasta_io.puts entry.to_fasta
|
|
147
145
|
else
|
|
@@ -193,9 +191,9 @@ module BioDSL
|
|
|
193
191
|
BioDSL::Fasta.open(fa_out, 'r') do |ios|
|
|
194
192
|
ios.each do |entry|
|
|
195
193
|
output << entry.to_bp
|
|
196
|
-
@status[:records_out]
|
|
194
|
+
@status[:records_out] += 1
|
|
197
195
|
@status[:sequences_out] += 1
|
|
198
|
-
@status[:residues_out]
|
|
196
|
+
@status[:residues_out] += entry.length
|
|
199
197
|
|
|
200
198
|
@lengths << entry.length
|
|
201
199
|
end
|
|
@@ -212,7 +210,7 @@ module BioDSL
|
|
|
212
210
|
@lengths.reverse!
|
|
213
211
|
|
|
214
212
|
status[:contig_max] = @lengths.first || 0
|
|
215
|
-
status[:contig_min] = @lengths.last
|
|
213
|
+
status[:contig_min] = @lengths.last || 0
|
|
216
214
|
status[:contig_n50] = 0
|
|
217
215
|
|
|
218
216
|
count = 0
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -164,10 +164,10 @@ module BioDSL
|
|
|
164
164
|
|
|
165
165
|
# Set the default option values.
|
|
166
166
|
def defaults
|
|
167
|
-
@options[:kmer_min]
|
|
168
|
-
@options[:kmer_max]
|
|
167
|
+
@options[:kmer_min] ||= 21
|
|
168
|
+
@options[:kmer_max] ||= 49
|
|
169
169
|
@options[:contig_min] ||= 500
|
|
170
|
-
@options[:cpus]
|
|
170
|
+
@options[:cpus] ||= 1
|
|
171
171
|
end
|
|
172
172
|
|
|
173
173
|
# Read all records from input and emit non-sequence records to the output
|
|
@@ -185,7 +185,7 @@ module BioDSL
|
|
|
185
185
|
entry = BioDSL::Seq.new_bp(record)
|
|
186
186
|
|
|
187
187
|
@status[:sequences_in] += 1
|
|
188
|
-
@status[:residues_in]
|
|
188
|
+
@status[:residues_in] += entry.length
|
|
189
189
|
|
|
190
190
|
fasta_io.puts entry.to_fasta
|
|
191
191
|
else
|
|
@@ -314,11 +314,11 @@ module BioDSL
|
|
|
314
314
|
next if entry.length < @options[:contig_min]
|
|
315
315
|
|
|
316
316
|
lengths << entry.length
|
|
317
|
-
output
|
|
317
|
+
output << entry.to_bp
|
|
318
318
|
|
|
319
|
-
@status[:records_out]
|
|
319
|
+
@status[:records_out] += 1
|
|
320
320
|
@status[:sequences_out] += 1
|
|
321
|
-
@status[:residues_out]
|
|
321
|
+
@status[:residues_out] += entry.length
|
|
322
322
|
end
|
|
323
323
|
end
|
|
324
324
|
|
|
@@ -333,11 +333,11 @@ module BioDSL
|
|
|
333
333
|
@status[:kmer] = kmer
|
|
334
334
|
@status[:paired] = @paired
|
|
335
335
|
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
336
|
+
return if lengths.empty?
|
|
337
|
+
|
|
338
|
+
@status[:contig_min] = lengths.min
|
|
339
|
+
@status[:contig_max] = lengths.max
|
|
340
|
+
@status[:n50] = calc_n50(lengths)
|
|
341
341
|
end
|
|
342
342
|
|
|
343
343
|
N50 = Struct.new(:kmer, :n50)
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -160,7 +160,7 @@ module BioDSL
|
|
|
160
160
|
entry = BioDSL::Seq.new_bp(record)
|
|
161
161
|
|
|
162
162
|
@status[:sequences_in] += 1
|
|
163
|
-
@status[:residues_in]
|
|
163
|
+
@status[:residues_in] += entry.length
|
|
164
164
|
|
|
165
165
|
if entry.qual
|
|
166
166
|
@type = :fastq
|
|
@@ -216,9 +216,9 @@ module BioDSL
|
|
|
216
216
|
BioDSL::Fasta.open(output_file) do |ios|
|
|
217
217
|
ios.each do |entry|
|
|
218
218
|
output << entry.to_bp
|
|
219
|
-
@status[:records_out]
|
|
219
|
+
@status[:records_out] += 1
|
|
220
220
|
@status[:sequences_out] += 1
|
|
221
|
-
@status[:residues_out]
|
|
221
|
+
@status[:residues_out] += entry.length
|
|
222
222
|
|
|
223
223
|
@lengths << entry.length
|
|
224
224
|
end
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -185,12 +185,12 @@ module BioDSL
|
|
|
185
185
|
|
|
186
186
|
# Set default options.
|
|
187
187
|
def defaults
|
|
188
|
-
@options[:prefix]
|
|
188
|
+
@options[:prefix] ||= 'taxonomy'
|
|
189
189
|
@options[:kmer_size] ||= 8
|
|
190
190
|
@options[:step_size] ||= 1
|
|
191
|
-
@options[:hits_max]
|
|
191
|
+
@options[:hits_max] ||= 50
|
|
192
192
|
@options[:consensus] ||= 0.51
|
|
193
|
-
@options[:coverage]
|
|
193
|
+
@options[:coverage] ||= 0.9
|
|
194
194
|
@options[:best_only] = true if @options[:best_only].nil?
|
|
195
195
|
end
|
|
196
196
|
|
|
@@ -200,14 +200,14 @@ module BioDSL
|
|
|
200
200
|
# @param i [Fixnum] Record number.
|
|
201
201
|
# @param search [BioDSL::Taxonomy::Search] Search object.
|
|
202
202
|
def classify_seq(record, i, search)
|
|
203
|
-
@status[:sequences_in]
|
|
203
|
+
@status[:sequences_in] += 1
|
|
204
204
|
@status[:sequences_out] += 1
|
|
205
|
-
@status[:residues_in]
|
|
206
|
-
@status[:residues_out]
|
|
205
|
+
@status[:residues_in] += record[:SEQ].length
|
|
206
|
+
@status[:residues_out] += record[:SEQ].length
|
|
207
207
|
seq_name = record[:SEQ_NAME] || i.to_s
|
|
208
208
|
|
|
209
209
|
result = search.execute(BioDSL::Seq.new(seq_name: seq_name,
|
|
210
|
-
|
|
210
|
+
seq: record[:SEQ]))
|
|
211
211
|
|
|
212
212
|
record[:TAXONOMY] = result.taxonomy
|
|
213
213
|
record[:TAXONOMY_HITS] = result.hits
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -128,7 +128,7 @@ module BioDSL
|
|
|
128
128
|
# Set default options.
|
|
129
129
|
def defaults
|
|
130
130
|
@options[:confidence] ||= 80
|
|
131
|
-
@options[:cpus]
|
|
131
|
+
@options[:cpus] ||= 1
|
|
132
132
|
end
|
|
133
133
|
|
|
134
134
|
# Process input data and save sequences to a temporary file for
|
|
@@ -143,10 +143,10 @@ module BioDSL
|
|
|
143
143
|
@status[:records_in] += 1
|
|
144
144
|
|
|
145
145
|
if record[:SEQ]
|
|
146
|
-
@status[:sequences_in]
|
|
146
|
+
@status[:sequences_in] += 1
|
|
147
147
|
@status[:sequences_out] += 1
|
|
148
|
-
@status[:residues_in]
|
|
149
|
-
@status[:records_out]
|
|
148
|
+
@status[:residues_in] += record[:SEQ].length
|
|
149
|
+
@status[:records_out] += record[:SEQ].length
|
|
150
150
|
seq_name = record[:SEQ_NAME] || i.to_s
|
|
151
151
|
|
|
152
152
|
entry = BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -166,16 +166,16 @@ module BioDSL
|
|
|
166
166
|
|
|
167
167
|
# Set default option values.
|
|
168
168
|
def defaults
|
|
169
|
-
@options[:mismatch_percent]
|
|
169
|
+
@options[:mismatch_percent] ||= 0
|
|
170
170
|
@options[:insertion_percent] ||= 0
|
|
171
|
-
@options[:deletion_percent]
|
|
171
|
+
@options[:deletion_percent] ||= 0
|
|
172
172
|
end
|
|
173
173
|
|
|
174
174
|
# Calculate the mismatch percentage.
|
|
175
175
|
#
|
|
176
176
|
# @return [Integer] Number of mismatches allowed (primer length times mismatch percentage, rounded).
|
|
177
177
|
def calc_mis
|
|
178
|
-
(@primer.length * @options[:mismatch_percent]
|
|
178
|
+
(@primer.length * @options[:mismatch_percent] * 0.01).round
|
|
179
179
|
end
|
|
180
180
|
|
|
181
181
|
# Calculate the insertion percentage.
|
|
@@ -189,7 +189,7 @@ module BioDSL
|
|
|
189
189
|
#
|
|
190
190
|
# @return [Integer] Number of deletions allowed (primer length times deletion percentage, rounded).
|
|
191
191
|
def calc_del
|
|
192
|
-
(@primer.length * @options[:deletion_percent]
|
|
192
|
+
(@primer.length * @options[:deletion_percent] * 0.01).round
|
|
193
193
|
end
|
|
194
194
|
|
|
195
195
|
# Reset any previous clip_primer results from record.
|
|
@@ -207,7 +207,7 @@ module BioDSL
|
|
|
207
207
|
entry = BioDSL::Seq.new_bp(record)
|
|
208
208
|
|
|
209
209
|
@status[:sequences_in] += 1
|
|
210
|
-
@status[:residues_in]
|
|
210
|
+
@status[:residues_in] += entry.length
|
|
211
211
|
|
|
212
212
|
case @options[:direction]
|
|
213
213
|
when :forward then clip_primer_forward(record, entry)
|
|
@@ -217,7 +217,7 @@ module BioDSL
|
|
|
217
217
|
end
|
|
218
218
|
|
|
219
219
|
@status[:sequences_out] += 1
|
|
220
|
-
@status[:residues_out]
|
|
220
|
+
@status[:residues_out] += entry.length
|
|
221
221
|
end
|
|
222
222
|
|
|
223
223
|
# Clip forward primer from entry and save clip information
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -89,8 +89,8 @@ module BioDSL
|
|
|
89
89
|
process_input(input, output, tmp_in)
|
|
90
90
|
|
|
91
91
|
BioDSL::Usearch.cluster_otus(input: tmp_in, output: tmp_out,
|
|
92
|
-
|
|
93
|
-
|
|
92
|
+
identity: @options[:identity],
|
|
93
|
+
verbose: @options[:verbose])
|
|
94
94
|
|
|
95
95
|
process_output(output, tmp_out)
|
|
96
96
|
end
|
|
@@ -172,8 +172,8 @@ module BioDSL
|
|
|
172
172
|
|
|
173
173
|
output << record
|
|
174
174
|
@status[:sequences_out] += 1
|
|
175
|
-
@status[:residues_out]
|
|
176
|
-
@status[:records_out]
|
|
175
|
+
@status[:residues_out] += record[:SEQ].length
|
|
176
|
+
@status[:records_out] += 1
|
|
177
177
|
end
|
|
178
178
|
end
|
|
179
179
|
end
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -162,7 +162,7 @@ module BioDSL
|
|
|
162
162
|
def write_tax(hash, output)
|
|
163
163
|
hash.each_value do |record|
|
|
164
164
|
output << record
|
|
165
|
-
@status[:otus_out]
|
|
165
|
+
@status[:otus_out] += 1
|
|
166
166
|
@status[:records_out] += 1
|
|
167
167
|
end
|
|
168
168
|
end
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -142,7 +142,7 @@ module BioDSL
|
|
|
142
142
|
|
|
143
143
|
output << record
|
|
144
144
|
|
|
145
|
-
@status[:hits_out]
|
|
145
|
+
@status[:hits_out] += 1
|
|
146
146
|
@status[:records_out] += 1
|
|
147
147
|
end
|
|
148
148
|
end
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -106,10 +106,10 @@ module BioDSL
|
|
|
106
106
|
entry.type = @type
|
|
107
107
|
entry.complement!
|
|
108
108
|
|
|
109
|
-
@status[:sequences_in]
|
|
109
|
+
@status[:sequences_in] += 1
|
|
110
110
|
@status[:sequences_out] += 1
|
|
111
|
-
@status[:residues_in]
|
|
112
|
-
@status[:residues_out]
|
|
111
|
+
@status[:residues_in] += entry.length
|
|
112
|
+
@status[:residues_out] += entry.length
|
|
113
113
|
|
|
114
114
|
record.merge! entry.to_bp
|
|
115
115
|
end
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -74,7 +74,7 @@ module BioDSL
|
|
|
74
74
|
#
|
|
75
75
|
# @return [CountValues] Instance of class.
|
|
76
76
|
def initialize(options)
|
|
77
|
-
@options
|
|
77
|
+
@options = options
|
|
78
78
|
|
|
79
79
|
check_options
|
|
80
80
|
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -68,7 +68,6 @@ module BioDSL
|
|
|
68
68
|
# {:SEQ_NAME=>"test1", :SEQ=>"A-GTC", :SEQ_LEN=>5}
|
|
69
69
|
# {:SEQ_NAME=>"test2", :SEQ=>"AGGTC", :SEQ_LEN=>5}
|
|
70
70
|
#
|
|
71
|
-
# rubocop:disable ClassLength
|
|
72
71
|
class DegapSeq
|
|
73
72
|
require 'narray'
|
|
74
73
|
|
|
@@ -157,14 +156,14 @@ module BioDSL
|
|
|
157
156
|
# @param seq [String] Sequences.
|
|
158
157
|
def mask_add(seq)
|
|
159
158
|
@status[:sequences_in] += 1
|
|
160
|
-
@status[:residues_in]
|
|
159
|
+
@status[:residues_in] += seq.length
|
|
161
160
|
|
|
162
161
|
@max_len ||= seq.length
|
|
163
162
|
|
|
164
163
|
check_length(seq)
|
|
165
164
|
|
|
166
165
|
@na_mask ||= NArray.int(seq.length)
|
|
167
|
-
na_seq
|
|
166
|
+
na_seq = NArray.to_na(seq, 'byte')
|
|
168
167
|
@indels.each_char { |c| @na_mask += na_seq.eq(c.ord) }
|
|
169
168
|
end
|
|
170
169
|
|
|
@@ -212,7 +211,7 @@ module BioDSL
|
|
|
212
211
|
record[:SEQ_LEN] = record[:SEQ].length
|
|
213
212
|
|
|
214
213
|
@status[:sequences_out] += 1
|
|
215
|
-
@status[:residues_out]
|
|
214
|
+
@status[:residues_out] += record[:SEQ].length
|
|
216
215
|
end
|
|
217
216
|
|
|
218
217
|
# Remove all gaps from all sequences in input stream and output to output
|
|
@@ -240,12 +239,12 @@ module BioDSL
|
|
|
240
239
|
entry = BioDSL::Seq.new_bp(record)
|
|
241
240
|
|
|
242
241
|
@status[:sequences_in] += 1
|
|
243
|
-
@status[:residues_in]
|
|
242
|
+
@status[:residues_in] += entry.length
|
|
244
243
|
|
|
245
244
|
entry.seq.delete!(@indels)
|
|
246
245
|
|
|
247
246
|
@status[:sequences_out] += 1
|
|
248
|
-
@status[:residues_out]
|
|
247
|
+
@status[:residues_out] += entry.length
|
|
249
248
|
|
|
250
249
|
record.merge! entry.to_bp
|
|
251
250
|
end
|