BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
|
|
18
|
+
# #
|
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
20
|
+
# #
|
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
22
|
+
# #
|
|
23
|
+
# This software is part of BioDSL (www.BioDSL.org). #
|
|
24
|
+
# #
|
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
|
+
|
|
27
|
+
module BioDSL
  # Error class for all exceptions to do with Trim.
  class TrimError < StandardError; end

  # Module containing methods for end trimming sequences with suboptimal
  # quality scores.
  #
  # The methods read +seq+, +qual+ and +length+ from the including object and
  # slice it with +self[...]+. A residue counts as acceptable when its decoded
  # score (quality character minus Seq::SCORE_BASE) is at or above +min_qual+.
  module Trim
    # Method to progressively trim a Seq object sequence from the right end
    # until a run of min_len residues with quality scores at or above min_qual
    # is encountered.
    #
    # Returns a new, trimmed object; the receiver is left untouched.
    # Raises TrimError on missing sequence/scores or bad arguments.
    def quality_trim_right(min_qual, min_len = 1)
      check_trim_args(min_qual, min_len)

      pos = trim_right_pos_c(self.qual, self.length, min_qual, min_len, Seq::SCORE_BASE)

      # pos is the number of residues to keep (an exclusive end position), so
      # the range must exclude it. An inclusive range would retain the first
      # low-quality residue.
      self[0...pos]
    end

    # In-place version of quality_trim_right: replaces seq and qual of the
    # receiver with the trimmed versions and returns self.
    def quality_trim_right!(min_qual, min_len = 1)
      subseq = quality_trim_right(min_qual, min_len)
      self.seq  = subseq.seq
      self.qual = subseq.qual
      self
    end

    # Method to progressively trim a Seq object sequence from the left end
    # until a run of min_len residues with quality scores at or above min_qual
    # is encountered.
    #
    # Returns a new, trimmed object; the receiver is left untouched.
    # Raises TrimError on missing sequence/scores or bad arguments.
    def quality_trim_left(min_qual, min_len = 1)
      check_trim_args(min_qual, min_len)

      pos = trim_left_pos_c(self.qual, self.length, min_qual, min_len, Seq::SCORE_BASE)

      # pos is the index of the first residue to keep.
      self[pos .. self.length]
    end

    # In-place version of quality_trim_left: replaces seq and qual of the
    # receiver with the trimmed versions and returns self.
    def quality_trim_left!(min_qual, min_len = 1)
      subseq = quality_trim_left(min_qual, min_len)
      self.seq  = subseq.seq
      self.qual = subseq.qual
      self
    end

    # Method to progressively trim a Seq object sequence from both ends until
    # a run of min_len residues with quality scores at or above min_qual is
    # encountered.
    #
    # Returns a new, trimmed object; the receiver is left untouched.
    # Raises TrimError on missing sequence/scores or bad arguments.
    def quality_trim(min_qual, min_len = 1)
      check_trim_args(min_qual, min_len)

      pos_right = trim_right_pos_c(self.qual, self.length, min_qual, min_len, Seq::SCORE_BASE)
      pos_left  = trim_left_pos_c(self.qual, self.length, min_qual, min_len, Seq::SCORE_BASE)

      # When the two scans cross, nothing is left: collapse to an empty slice.
      pos_left = pos_right if pos_left > pos_right

      # pos_right is an exclusive end position, hence the three-dot range.
      self[pos_left...pos_right]
    end

    # In-place version of quality_trim: replaces seq and qual of the receiver
    # with the trimmed versions and returns self.
    def quality_trim!(min_qual, min_len = 1)
      subseq = quality_trim(min_qual, min_len)
      self.seq  = subseq.seq
      self.qual = subseq.qual
      self
    end

    private

    # Method to check the arguments for trimming and raise TrimError on
    # missing sequence, missing quality scores, min_qual outside
    # Seq::SCORE_MIN .. Seq::SCORE_MAX, or a non-positive min_len.
    def check_trim_args(min_qual, min_len)
      raise TrimError, "no sequence"      if self.seq.nil?
      raise TrimError, "no quality score" if self.qual.nil?
      unless (Seq::SCORE_MIN .. Seq::SCORE_MAX).include? min_qual
        raise TrimError, "minimum quality value: #{min_qual} out of range #{Seq::SCORE_MIN} .. #{Seq::SCORE_MAX}"
      end
      raise TrimError, "min_len must be larger than zero" if min_len <= 0
    end

    # Inline C functions for speed below.
    inline do |builder|
      # Method for locating the right trim position and return this.
      # The returned value is the number of leading residues to keep, i.e. an
      # exclusive end position; 0 is returned when no qualifying run exists.
      builder.c %{
        VALUE trim_right_pos_c(
          VALUE _qual,        // quality score string
          VALUE _len,         // length of quality score string
          VALUE _min_qual,    // minimum quality score
          VALUE _min_len,     // minimum quality length
          VALUE _score_base   // score base
        )
        {
          char         *qual       = StringValuePtr(_qual);
          unsigned int  len        = FIX2UINT(_len);
          unsigned int  min_qual   = FIX2UINT(_min_qual);
          unsigned int  min_len    = FIX2UINT(_min_len);
          unsigned int  score_base = FIX2UINT(_score_base);

          // Compare the raw byte against the encoded cutoff. Subtracting
          // score_base from the byte in unsigned arithmetic would wrap for
          // bytes below score_base and make them look like top scores.
          unsigned int  cutoff     = min_qual + score_base;

          unsigned int i = 0;   // residues trimmed from the right so far
          unsigned int c = 0;   // length of the current acceptable run

          while (i < len)
          {
            c = 0;

            while ((c < min_len) && ((c + i) < len) &&
                   ((unsigned int) (unsigned char) qual[len - (c + i) - 1] >= cutoff))
              c++;

            if (c == min_len)
              return UINT2NUM(len - i);

            // Skip the too-short run and the failing residue that ended it.
            i += c + 1;
          }

          return UINT2NUM(0);
        }
      }

      # Method for locating the left trim position and return this.
      # The returned value is the index of the first residue to keep; the
      # string length is returned when no qualifying run exists.
      builder.c %{
        VALUE trim_left_pos_c(
          VALUE _qual,        // quality score string
          VALUE _len,         // length of quality score string
          VALUE _min_qual,    // minimum quality score
          VALUE _min_len,     // minimum quality length
          VALUE _score_base   // score base
        )
        {
          char         *qual       = StringValuePtr(_qual);
          unsigned int  len        = FIX2UINT(_len);
          unsigned int  min_qual   = FIX2UINT(_min_qual);
          unsigned int  min_len    = FIX2UINT(_min_len);
          unsigned int  score_base = FIX2UINT(_score_base);

          // See trim_right_pos_c: compare against the encoded cutoff to
          // avoid unsigned wrap-around for bytes below score_base.
          unsigned int  cutoff     = min_qual + score_base;

          unsigned int i = 0;   // residues trimmed from the left so far
          unsigned int c = 0;   // length of the current acceptable run

          while (i < len)
          {
            c = 0;

            while ((c < min_len) && ((c + i) < len) &&
                   ((unsigned int) (unsigned char) qual[c + i] >= cutoff))
              c++;

            if (c == min_len)
              return UINT2NUM(i);

            // Skip the too-short run and the failing residue that ended it.
            i += c + 1;
          }

          // Nothing qualified: everything is trimmed. Report len rather than
          // a position past the end of the string.
          return UINT2NUM(len);
        }
      }
    end
  end
end
|
|
187
|
+
|
|
188
|
+
__END__
|