BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
|
|
18
|
+
# #
|
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
20
|
+
# #
|
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
22
|
+
# #
|
|
23
|
+
# This software is part of BioDSL (www.BioDSL.org). #
|
|
24
|
+
# #
|
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
|
+
|
|
27
|
+
module BioDSL
|
|
28
|
+
# Error class for all exceptions to do with Kmer.
|
|
29
|
+
class KmerError < StandardError
end
|
|
30
|
+
|
|
31
|
+
# Module containing methods for manipulating sequence kmers.
|
|
32
|
+
module Kmer
|
|
33
|
+
# Debug method to convert an array of binary encoded kmers to
|
|
34
|
+
# nucleotide oligos.
|
|
35
|
+
# Each kmer is an Integer holding kmer_size residues at two bits per residue
# (00 = a, 01 = t, 10 = c, 11 = g), most significant pair first.
#
# kmers     - Array of non-negative Integers.
# kmer_size - Number of residues encoded in every kmer.
#
# Returns an Array of lowercase Strings, one oligo of kmer_size residues per
# kmer, in the order given.
# Raises KmerError if a kmer is negative or needs more than kmer_size
# residues. Such values used to be decoded into truncated or over-long
# oligos without any warning.
def self.to_oligos(kmers, kmer_size)
  residues = %w[a t c g]
  limit    = 4**kmer_size

  kmers.map do |kmer|
    unless (0...limit).cover?(kmer)
      raise KmerError, "kmer #{kmer} out of range for kmer_size #{kmer_size}"
    end

    # Walk the two-bit pairs from the most significant end.
    (kmer_size - 1).downto(0).map { |pos| residues[(kmer >> (2 * pos)) & 3] }.join
  end
end
|
|
58
|
+
|
|
59
|
+
# Method that returns a sorted array of unique kmers, which are integer
|
|
60
|
+
# representations of DNA/RNA sequence oligos where A is encoded in two bits
|
|
61
|
+
# as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other nucleotides
|
|
62
|
+
# are ignored. The following options apply:
|
|
63
|
+
# * kmer_size: kmer size in the range 1-12.
|
|
64
|
+
# * step_size: step size in the range 1-12 (default=1).
|
|
65
|
+
# * score_min: drop kmers with quality score below this.
|
|
66
|
+
# Validates the options, prepares the scratch byte array and delegates the
# kmer extraction to the inline C routines.
#
# options - Hash with :kmer_size (required, 1-12), :step_size (1-12,
#           default 1) and :score_min (default Seq::SCORE_MAX; only range
#           checked and used when the sequence has quality scores).
#           The Hash is only read: defaults are not written back into it.
#
# Returns the Array of Integer kmers built by to_kmers_C / to_kmers_qual_C.
# Raises KmerError on a missing or out-of-range option.
def to_kmers(options)
  kmer_size = options[:kmer_size]
  step_size = options[:step_size] || 1
  score_min = options[:score_min] || Seq::SCORE_MAX

  raise KmerError, "No kmer_size" unless kmer_size
  raise KmerError, "Bad kmer_size: #{kmer_size}" unless (1..12).include?(kmer_size)
  raise KmerError, "Bad step_size: #{step_size}" unless (1..12).include?(step_size)

  if self.qual && !(Seq::SCORE_MIN..Seq::SCORE_MAX).include?(score_min)
    raise KmerError, "score minimum: #{score_min} out of range #{Seq::SCORE_MIN} .. #{Seq::SCORE_MAX}"
  end

  # One byte per possible kmer of this size.
  size = Seq::DNA.size**kmer_size

  # Reuse the scratch array between calls as long as the kmer space is the
  # same size; otherwise allocate a fresh one.
  if defined?(@kmer_ary) && @kmer_ary.count == size
    @kmer_ary.zero!
  else
    @kmer_ary = BioDSL::CAry.new(size, 1)
  end

  if self.qual
    to_kmers_qual_C(self.seq, self.qual, @kmer_ary.ary, self.length, @kmer_ary.count,
                    kmer_size, step_size, score_min, Seq::SCORE_BASE)
  else
    to_kmers_C(self.seq, @kmer_ary.ary, self.length, @kmer_ary.count, kmer_size, step_size)
  end
end
|
|
90
|
+
|
|
91
|
+
private
|
|
92
|
+
|
|
93
|
+
# RubyInline section holding the C helpers that to_kmers delegates to. The C
# source below is passed to the builder as plain strings.
inline do |builder|
  # encode_nuc shifts *bin left by two bits and ORs in the code of the given
  # residue: a/A = 0, t/T and u/U = 1, c/C = 2, g/G = 3. It returns 1 for
  # these residues and 0 for anything else. Note that the shift has already
  # happened when 0 is returned.
  builder.prefix %{
    int encode_nuc(char nuc, unsigned int *bin)
    {
      *bin <<= 2;

      switch(nuc)
      {
        case 'a':
          *bin |= 0;
          break;
        case 'A':
          *bin |= 0;
          break;
        case 't':
          *bin |= 1;
          break;
        case 'T':
          *bin |= 1;
          break;
        case 'u':
          *bin |= 1;
          break;
        case 'U':
          *bin |= 1;
          break;
        case 'c':
          *bin |= 2;
          break;
        case 'C':
          *bin |= 2;
          break;
        case 'g':
          *bin |= 3;
          break;
        case 'G':
          *bin |= 3;
          break;
        default:
          return 0;
      }

      return 1;
    }
  }

  # to_kmers_C scans the sequence once. `enc` counts the run of consecutive
  # encodable residues and is reset by any other residue. Whenever the run is
  # at least kmer_size long and the current index i (the index of the kmer's
  # last residue) is a multiple of step_size, the byte at index (bin & mask)
  # is set to 1, where mask keeps the lowest 2 * kmer_size bits. Finally every
  # index of ary holding a non-zero byte is pushed onto a new Ruby Array in
  # ascending order, so each kmer is reported at most once. Bytes that are
  # already non-zero in ary on entry are reported as well.
  builder.c %{
    VALUE to_kmers_C(
      VALUE _seq, // DNA or RNA sequence string.
      VALUE _ary, // byte array for sort and uniq.
      VALUE _seq_len, // sequence length.
      VALUE _ary_len, // byte array length.
      VALUE _kmer_size, // Size of kmer or oligo.
      VALUE _step_size // Step size for overlapping kmers.
    )
    {
      char *seq = StringValuePtr(_seq);
      char *ary = StringValuePtr(_ary);
      unsigned int seq_len = FIX2UINT(_seq_len);
      unsigned int ary_len = FIX2UINT(_ary_len);
      unsigned int kmer_size = FIX2UINT(_kmer_size);
      unsigned int step_size = FIX2UINT(_step_size);

      VALUE array = rb_ary_new();
      unsigned int bin = 0;
      unsigned int enc = 0;
      unsigned int i = 0;
      unsigned int mask = (1 << (2 * kmer_size)) - 1;

      for (i = 0; i < seq_len; i++)
      {
        if (encode_nuc(seq[i], &bin))
        {
          enc++;

          if (((i % step_size) == 0) && (enc >= kmer_size)) {
            ary[(bin & mask)] = 1;
          }
        }
        else
        {
          enc = 0;
        }
      }

      for (i = 0; i < ary_len; i++)
      {
        if (ary[i]) {
          rb_ary_push(array, INT2FIX(i));
        }
      }

      return array;
    }
  }

  # to_kmers_qual_C works like to_kmers_C with one extra rule: a residue whose
  # quality character minus score_base is below score_min resets the run
  # counter, so no kmer spanning that residue is recorded. The subtraction is
  # done in unsigned arithmetic.
  builder.c %{
    VALUE to_kmers_qual_C(
      VALUE _seq, // DNA or RNA sequence string.
      VALUE _qual, // Quality score string.
      VALUE _ary, // Byte array for sort and uniq.
      VALUE _seq_len, // Sequence length.
      VALUE _ary_len, // Byte array length.
      VALUE _kmer_size, // Size of kmer or oligo.
      VALUE _step_size, // Step size for overlapping kmers.
      VALUE _score_min, // Miminum quality score to accept in a kmer.
      VALUE _score_base // Quality score base.
    )
    {
      char *seq = StringValuePtr(_seq);
      char *qual = StringValuePtr(_qual);
      char *ary = StringValuePtr(_ary);
      unsigned int seq_len = FIX2UINT(_seq_len);
      unsigned int ary_len = FIX2UINT(_ary_len);
      unsigned int kmer_size = FIX2UINT(_kmer_size);
      unsigned int step_size = FIX2UINT(_step_size);
      unsigned int score_min = FIX2UINT(_score_min);
      unsigned int score_base = FIX2UINT(_score_base);

      VALUE array = rb_ary_new();
      unsigned int bin = 0;
      unsigned int enc = 0;
      unsigned int i = 0;
      unsigned int mask = (1 << (2 * kmer_size)) - 1;

      for (i = 0; i < seq_len; i++)
      {
        if (encode_nuc(seq[i], &bin))
        {
          enc++;

          if ((unsigned int) qual[i] - score_base < score_min)
          {
            enc = 0;
          }
          else if ((enc >= kmer_size) && ((i % step_size) == 0))
          {
            ary[(bin & mask)] = 1;
          }
        }
        else
        {
          enc = 0;
        }
      }

      for (i = 0; i < ary_len; i++)
      {
        if (ary[i]) {
          rb_ary_push(array, INT2FIX(i));
        }
      }

      return array;
    }
  }
end
|
|
250
|
+
|
|
251
|
+
# Pure Ruby reference implementation of kmer extraction. Returns the
# uppercased oligo of every window of options[:kmer_size] residues, in
# sequence order and including duplicates. Windows holding anything other
# than A, T, U, C or G are skipped, and so are windows whose lowest quality
# score is below the minimum score.
#
# options - Hash with :kmer_size and optionally :score_min, the key
#           documented for to_kmers. The historical key :scores_min is still
#           honoured when :score_min is absent.
#
# Returns an Array of Strings.
def naive(options)
  kmer_size = options[:kmer_size]
  score_min = options[:score_min] || options[:scores_min]
  oligos    = []

  (0..self.length - kmer_size).each do |i|
    oligo = self[i...i + kmer_size]
    word  = oligo.seq.upcase

    # \A and \z anchor the whole window; ^ and $ would accept a window that
    # merely contains one valid line.
    next unless word =~ /\A[ATUCG]+\z/
    next if oligo.qual && score_min && oligo.scores_min < score_min

    oligos << word
  end

  oligos
end
|
|
265
|
+
|
|
266
|
+
# Pure Ruby reference implementation returning the two-bit integer encoding
# (A = 0, T/U = 1, C = 2, G = 3, first residue in the most significant
# position) of every window of options[:kmer_size] residues, in sequence
# order and including duplicates. Windows holding other residues, or whose
# lowest quality score is below the minimum score, are skipped.
#
# U is accepted here as well; the previous filter rejected it although the
# encoder had a branch for it.
#
# options - Hash with :kmer_size and optionally :score_min, the key
#           documented for to_kmers. The historical key :scores_min is still
#           honoured when :score_min is absent.
#
# Returns an Array of Integers.
def naive_bin(options)
  kmer_size = options[:kmer_size]
  score_min = options[:score_min] || options[:scores_min]
  codes     = { 'A' => 0, 'T' => 1, 'U' => 1, 'C' => 2, 'G' => 3 }
  kmers     = []

  (0..self.length - kmer_size).each do |i|
    oligo = self[i...i + kmer_size]
    word  = oligo.seq.upcase

    # \A and \z anchor the whole window; ^ and $ would accept a window that
    # merely contains one valid line.
    next unless word =~ /\A[ATUCG]+\z/
    next if oligo.qual && score_min && oligo.scores_min < score_min

    kmers << word.each_char.reduce(0) { |bin, residue| (bin << 2) | codes[residue] }
  end

  kmers
end
|
|
292
|
+
end
|
|
293
|
+
end
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
|
|
18
|
+
# #
|
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
20
|
+
# #
|
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
22
|
+
# #
|
|
23
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
24
|
+
# #
|
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
|
+
|
|
27
|
+
module BioDSL
|
|
28
|
+
# Class to calculate the Levenshtein distance between two
|
|
29
|
+
# given strings.
|
|
30
|
+
# http://en.wikipedia.org/wiki/Levenshtein_distance
|
|
31
|
+
class Levenshtein
|
|
32
|
+
extend BioDSL::Ambiguity
|
|
33
|
+
|
|
34
|
+
BYTES_IN_INT = 4
|
|
35
|
+
|
|
36
|
+
# Returns the Levenshtein distance between the strings s and t.
#
# Identical strings and the cases where either string is empty are answered
# directly. Everything else is handed to the inline C routine together with
# two zero-filled score vectors of (t.length + 1) * BYTES_IN_INT bytes.
def self.distance(s, t)
  if s == t
    0
  elsif s.length.zero?
    t.length
  elsif t.length.zero?
    s.length
  else
    row_bytes = (t.length + 1) * BYTES_IN_INT
    v0 = "\0" * row_bytes
    v1 = "\0" * row_bytes

    new.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
  end
end
|
|
47
|
+
|
|
48
|
+
# >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
|
|
49
|
+
|
|
50
|
+
# RubyInline section holding the C implementation used by Levenshtein.distance.
# The C source below is passed to the builder as plain strings.
inline do |builder|
  # NOTE(review): MATCH used in the C code below is not defined in this block;
  # presumably add_ambiguity_macro supplies it - confirm in BioDSL::Ambiguity.
  add_ambiguity_macro(builder)

  # min returns the smallest of three unsigned ints.
  builder.prefix %{
    unsigned int min(unsigned int a, unsigned int b, unsigned int c)
    {
      unsigned int m = a;

      if (m > b) m = b;
      if (m > c) m = c;

      return m;
    }
  }

  # levenshtein_distance_C keeps two rows of the edit distance matrix: v0 is
  # the previous row and v1 the row being filled. v0 starts out as 0..t_len.
  # For every character of s, v1[0] is set to i + 1 and each v1[j + 1] becomes
  # the minimum of v1[j] + 1, v0[j + 1] + 1 and v0[j] + cost, where cost is 0
  # when MATCH(s[i], t[j]) is true and 1 otherwise. The finished row is then
  # copied into v0. The result is v1[t_len] after the last row.
  #
  # _v0 and _v1 are Ruby strings used as raw storage for unsigned ints; the
  # loops write t_len + 1 entries into each of them.
  builder.c %{
    VALUE levenshtein_distance_C(
      VALUE _s, // string
      VALUE _t, // string
      VALUE _s_len, // string length
      VALUE _t_len, // string length
      VALUE _v0, // score vector
      VALUE _v1 // score vector
    )
    {
      char *s = (char *) StringValuePtr(_s);
      char *t = (char *) StringValuePtr(_t);
      unsigned int s_len = FIX2UINT(_s_len);
      unsigned int t_len = FIX2UINT(_t_len);
      unsigned int *v0 = (unsigned int *) StringValuePtr(_v0);
      unsigned int *v1 = (unsigned int *) StringValuePtr(_v1);

      unsigned int i = 0;
      unsigned int j = 0;
      unsigned int cost = 0;

      for (i = 0; i < t_len + 1; i++)
        v0[i] = i;

      for (i = 0; i < s_len; i++)
      {
        v1[0] = i + 1;

        for (j = 0; j < t_len; j++)
        {
          cost = (MATCH(s[i], t[j])) ? 0 : 1;
          v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost);
        }

        for (j = 0; j < t_len + 1; j++)
          v0[j] = v1[j];
      }

      return UINT2NUM(v1[t_len]);
    }
  }
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
__END__
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
|
|
18
|
+
# #
|
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
20
|
+
# #
|
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
22
|
+
# #
|
|
23
|
+
# This software is part of BioDSL (www.BioDSL.org). #
|
|
24
|
+
# #
|
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
|
+
|
|
27
|
+
module BioDSL
|
|
28
|
+
module Translate
|
|
29
|
+
# Translation table 11
|
|
30
|
+
# (http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes#SG11)
|
|
31
|
+
# AAs = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
|
|
32
|
+
# Starts = ---M---------------M------------MMMM---------------M------------
|
|
33
|
+
# Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
|
|
34
|
+
# Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
|
|
35
|
+
# Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
|
|
36
|
+
# Start codons of translation table 11; each of them is translated to M when
# it is the first codon of a sequence. Frozen so the shared lookup table
# cannot be modified by accident.
TRANS_TAB11_START = {
  "TTG" => "M", "CTG" => "M", "ATT" => "M", "ATC" => "M",
  "ATA" => "M", "ATG" => "M", "GTG" => "M"
}.freeze

# Codon to amino acid mapping of translation table 11 ("*" marks a stop
# codon). Frozen so the shared lookup table cannot be modified by accident.
TRANS_TAB11 = {
  "TTT" => "F", "TCT" => "S", "TAT" => "Y", "TGT" => "C",
  "TTC" => "F", "TCC" => "S", "TAC" => "Y", "TGC" => "C",
  "TTA" => "L", "TCA" => "S", "TAA" => "*", "TGA" => "*",
  "TTG" => "L", "TCG" => "S", "TAG" => "*", "TGG" => "W",
  "CTT" => "L", "CCT" => "P", "CAT" => "H", "CGT" => "R",
  "CTC" => "L", "CCC" => "P", "CAC" => "H", "CGC" => "R",
  "CTA" => "L", "CCA" => "P", "CAA" => "Q", "CGA" => "R",
  "CTG" => "L", "CCG" => "P", "CAG" => "Q", "CGG" => "R",
  "ATT" => "I", "ACT" => "T", "AAT" => "N", "AGT" => "S",
  "ATC" => "I", "ACC" => "T", "AAC" => "N", "AGC" => "S",
  "ATA" => "I", "ACA" => "T", "AAA" => "K", "AGA" => "R",
  "ATG" => "M", "ACG" => "T", "AAG" => "K", "AGG" => "R",
  "GTT" => "V", "GCT" => "A", "GAT" => "D", "GGT" => "G",
  "GTC" => "V", "GCC" => "A", "GAC" => "D", "GGC" => "G",
  "GTA" => "V", "GCA" => "A", "GAA" => "E", "GGA" => "G",
  "GTG" => "V", "GCG" => "A", "GAG" => "E", "GGG" => "G"
}.freeze
|
|
59
|
+
|
|
60
|
+
# Method to translate a DNA sequence to protein.
|
|
61
|
+
# In-place variant of translate: translates with the given table and copies
# the name, sequence, type and quality of the resulting entry onto self.
# The name (when present) and the sequence are duplicated.
#
# trans_tab - Translation table number handed on to translate (default 11).
#
# Returns self.
def translate!(trans_tab = 11)
  translated = translate(trans_tab)
  name       = translated.seq_name

  self.seq_name = name ? name.dup : nil
  self.seq      = translated.seq.dup
  self.type     = translated.type
  self.qual     = translated.qual

  self
end
|
|
71
|
+
|
|
72
|
+
alias :to_protein! :translate!
|
|
73
|
+
|
|
74
|
+
# Translates a DNA sequence into a new protein Seq without touching self.
#
# The first codon is looked up in the start codon table and every following
# codon in the regular codon table. The last residue of the translation is
# left out of the returned sequence.
#
# trans_tab - Translation table number; only 11 is supported (default 11).
#
# Returns a new Seq of type :protein carrying the same seq_name.
# Raises SeqError if the type is not :dna, the length is not divisible by 3,
# the table is unknown, or a codon is missing from the table in use.
def translate(trans_tab = 11)
  unless self.type == :dna
    raise SeqError, "Sequence type must be 'dna' - not #{self.type}"
  end

  unless (self.length % 3).zero?
    raise SeqError, "Sequence length must be a multiplum of 3 - was: #{self.length}"
  end

  unless 11 == trans_tab
    raise SeqError, "Unknown translation table: #{trans_tab}"
  end

  start_codon = self.seq[0...3].upcase
  start_aa    = TRANS_TAB11_START[start_codon]

  raise SeqError, "Unknown start codon: #{start_codon}" if start_aa.nil?

  # Append one residue per remaining codon to a copy of the start residue.
  protein = (3...self.length).step(3).each_with_object(start_aa.dup) do |offset, residues|
    codon   = self.seq[offset, 3].upcase
    residue = TRANS_TAB11[codon]

    raise SeqError, "Unknown codon: #{codon}" if residue.nil?

    residues << residue
  end

  Seq.new(seq_name: self.seq_name, seq: protein[0..-2], type: :protein)
end
|
|
106
|
+
|
|
107
|
+
alias :to_protein :translate
|
|
108
|
+
end
|
|
109
|
+
end
|