BioDSL 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/BioDSL.gemspec +1 -1
- data/Gemfile +6 -0
- data/README.md +289 -155
- data/Rakefile +18 -16
- data/lib/BioDSL.rb +1 -1
- data/lib/BioDSL/cary.rb +78 -53
- data/lib/BioDSL/command.rb +2 -2
- data/lib/BioDSL/commands.rb +1 -1
- data/lib/BioDSL/commands/add_key.rb +1 -1
- data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
- data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
- data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
- data/lib/BioDSL/commands/classify_seq.rb +8 -8
- data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
- data/lib/BioDSL/commands/clip_primer.rb +7 -7
- data/lib/BioDSL/commands/cluster_otus.rb +5 -5
- data/lib/BioDSL/commands/collapse_otus.rb +2 -2
- data/lib/BioDSL/commands/collect_otus.rb +2 -2
- data/lib/BioDSL/commands/complement_seq.rb +4 -4
- data/lib/BioDSL/commands/count.rb +1 -1
- data/lib/BioDSL/commands/count_values.rb +2 -2
- data/lib/BioDSL/commands/degap_seq.rb +6 -7
- data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
- data/lib/BioDSL/commands/dump.rb +2 -2
- data/lib/BioDSL/commands/filter_rrna.rb +4 -4
- data/lib/BioDSL/commands/genecall.rb +7 -7
- data/lib/BioDSL/commands/grab.rb +1 -1
- data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
- data/lib/BioDSL/commands/mask_seq.rb +4 -4
- data/lib/BioDSL/commands/mean_scores.rb +2 -2
- data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
- data/lib/BioDSL/commands/merge_table.rb +1 -1
- data/lib/BioDSL/commands/merge_values.rb +1 -1
- data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
- data/lib/BioDSL/commands/plot_histogram.rb +4 -4
- data/lib/BioDSL/commands/plot_matches.rb +5 -5
- data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
- data/lib/BioDSL/commands/plot_scores.rb +7 -7
- data/lib/BioDSL/commands/random.rb +1 -1
- data/lib/BioDSL/commands/read_fasta.rb +9 -9
- data/lib/BioDSL/commands/read_fastq.rb +16 -16
- data/lib/BioDSL/commands/read_table.rb +2 -3
- data/lib/BioDSL/commands/reverse_seq.rb +4 -4
- data/lib/BioDSL/commands/slice_align.rb +4 -4
- data/lib/BioDSL/commands/slice_seq.rb +3 -3
- data/lib/BioDSL/commands/sort.rb +1 -1
- data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
- data/lib/BioDSL/commands/split_values.rb +2 -2
- data/lib/BioDSL/commands/trim_primer.rb +13 -8
- data/lib/BioDSL/commands/trim_seq.rb +5 -5
- data/lib/BioDSL/commands/uchime_ref.rb +6 -6
- data/lib/BioDSL/commands/uclust.rb +5 -5
- data/lib/BioDSL/commands/unique_values.rb +1 -1
- data/lib/BioDSL/commands/usearch_global.rb +2 -2
- data/lib/BioDSL/commands/usearch_local.rb +2 -2
- data/lib/BioDSL/commands/write_fasta.rb +7 -9
- data/lib/BioDSL/commands/write_fastq.rb +4 -4
- data/lib/BioDSL/commands/write_table.rb +3 -3
- data/lib/BioDSL/commands/write_tree.rb +2 -3
- data/lib/BioDSL/config.rb +2 -2
- data/lib/BioDSL/csv.rb +8 -10
- data/lib/BioDSL/debug.rb +1 -1
- data/lib/BioDSL/fasta.rb +54 -40
- data/lib/BioDSL/fastq.rb +35 -32
- data/lib/BioDSL/filesys.rb +56 -47
- data/lib/BioDSL/fork.rb +1 -1
- data/lib/BioDSL/hamming.rb +1 -1
- data/lib/BioDSL/helpers.rb +1 -1
- data/lib/BioDSL/helpers/aux_helper.rb +1 -1
- data/lib/BioDSL/helpers/email_helper.rb +1 -1
- data/lib/BioDSL/helpers/history_helper.rb +1 -1
- data/lib/BioDSL/helpers/log_helper.rb +1 -1
- data/lib/BioDSL/helpers/options_helper.rb +1 -1
- data/lib/BioDSL/helpers/status_helper.rb +1 -1
- data/lib/BioDSL/html_report.rb +1 -1
- data/lib/BioDSL/math.rb +1 -1
- data/lib/BioDSL/mummer.rb +1 -1
- data/lib/BioDSL/pipeline.rb +1 -1
- data/lib/BioDSL/seq.rb +240 -231
- data/lib/BioDSL/seq/ambiguity.rb +1 -1
- data/lib/BioDSL/seq/assemble.rb +1 -1
- data/lib/BioDSL/seq/backtrack.rb +93 -76
- data/lib/BioDSL/seq/digest.rb +1 -1
- data/lib/BioDSL/seq/dynamic.rb +43 -55
- data/lib/BioDSL/seq/homopolymer.rb +34 -36
- data/lib/BioDSL/seq/kmer.rb +67 -50
- data/lib/BioDSL/seq/levenshtein.rb +35 -40
- data/lib/BioDSL/seq/translate.rb +64 -55
- data/lib/BioDSL/seq/trim.rb +60 -50
- data/lib/BioDSL/serializer.rb +1 -1
- data/lib/BioDSL/stream.rb +1 -1
- data/lib/BioDSL/taxonomy.rb +1 -1
- data/lib/BioDSL/test.rb +1 -1
- data/lib/BioDSL/tmp_dir.rb +1 -1
- data/lib/BioDSL/usearch.rb +1 -1
- data/lib/BioDSL/verbose.rb +1 -1
- data/lib/BioDSL/version.rb +2 -2
- data/test/BioDSL/commands/test_add_key.rb +1 -1
- data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_clip_primer.rb +1 -1
- data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
- data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
- data/test/BioDSL/commands/test_collect_otus.rb +1 -1
- data/test/BioDSL/commands/test_complement_seq.rb +1 -1
- data/test/BioDSL/commands/test_count.rb +1 -1
- data/test/BioDSL/commands/test_count_values.rb +1 -1
- data/test/BioDSL/commands/test_degap_seq.rb +1 -1
- data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
- data/test/BioDSL/commands/test_dump.rb +1 -1
- data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
- data/test/BioDSL/commands/test_genecall.rb +1 -1
- data/test/BioDSL/commands/test_grab.rb +1 -1
- data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
- data/test/BioDSL/commands/test_mask_seq.rb +1 -1
- data/test/BioDSL/commands/test_mean_scores.rb +1 -1
- data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_merge_table.rb +1 -1
- data/test/BioDSL/commands/test_merge_values.rb +1 -1
- data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
- data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
- data/test/BioDSL/commands/test_plot_matches.rb +1 -1
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_plot_scores.rb +1 -1
- data/test/BioDSL/commands/test_random.rb +1 -1
- data/test/BioDSL/commands/test_read_fasta.rb +1 -1
- data/test/BioDSL/commands/test_read_fastq.rb +1 -1
- data/test/BioDSL/commands/test_read_table.rb +1 -1
- data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
- data/test/BioDSL/commands/test_slice_align.rb +1 -1
- data/test/BioDSL/commands/test_slice_seq.rb +1 -1
- data/test/BioDSL/commands/test_sort.rb +1 -1
- data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_split_values.rb +1 -1
- data/test/BioDSL/commands/test_trim_primer.rb +1 -1
- data/test/BioDSL/commands/test_trim_seq.rb +1 -1
- data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
- data/test/BioDSL/commands/test_uclust.rb +1 -1
- data/test/BioDSL/commands/test_unique_values.rb +1 -1
- data/test/BioDSL/commands/test_usearch_global.rb +1 -1
- data/test/BioDSL/commands/test_usearch_local.rb +1 -1
- data/test/BioDSL/commands/test_write_fasta.rb +1 -1
- data/test/BioDSL/commands/test_write_fastq.rb +1 -1
- data/test/BioDSL/commands/test_write_table.rb +1 -1
- data/test/BioDSL/commands/test_write_tree.rb +1 -1
- data/test/BioDSL/helpers/test_options_helper.rb +3 -3
- data/test/BioDSL/seq/test_assemble.rb +58 -56
- data/test/BioDSL/seq/test_backtrack.rb +83 -81
- data/test/BioDSL/seq/test_digest.rb +47 -45
- data/test/BioDSL/seq/test_dynamic.rb +66 -64
- data/test/BioDSL/seq/test_homopolymer.rb +35 -33
- data/test/BioDSL/seq/test_kmer.rb +29 -28
- data/test/BioDSL/seq/test_translate.rb +44 -42
- data/test/BioDSL/seq/test_trim.rb +59 -57
- data/test/BioDSL/test_cary.rb +1 -1
- data/test/BioDSL/test_command.rb +2 -2
- data/test/BioDSL/test_csv.rb +34 -31
- data/test/BioDSL/test_debug.rb +31 -31
- data/test/BioDSL/test_fasta.rb +30 -29
- data/test/BioDSL/test_fastq.rb +27 -26
- data/test/BioDSL/test_filesys.rb +28 -27
- data/test/BioDSL/test_fork.rb +29 -28
- data/test/BioDSL/test_math.rb +31 -30
- data/test/BioDSL/test_mummer.rb +1 -1
- data/test/BioDSL/test_pipeline.rb +1 -1
- data/test/BioDSL/test_seq.rb +42 -41
- data/test/BioDSL/test_serializer.rb +35 -33
- data/test/BioDSL/test_stream.rb +28 -27
- data/test/BioDSL/test_taxonomy.rb +38 -37
- data/test/BioDSL/test_test.rb +32 -31
- data/test/BioDSL/test_tmp_dir.rb +1 -1
- data/test/BioDSL/test_usearch.rb +28 -27
- data/test/BioDSL/test_verbose.rb +32 -31
- data/test/helper.rb +34 -31
- metadata +3 -2
data/lib/BioDSL/seq/kmer.rb
CHANGED
@@ -1,28 +1,29 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
4
|
-
#
|
5
|
-
# This program is free software; you can redistribute it and/or
|
6
|
-
# modify it under the terms of the GNU General Public License
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
8
|
-
# of the License, or (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program; if not, write to the Free Software
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
27
|
|
27
28
|
module BioDSL
|
28
29
|
# Error class for all exceptions to do with Kmer.
|
@@ -36,19 +37,19 @@ module BioDSL
|
|
36
37
|
oligos = []
|
37
38
|
|
38
39
|
kmers.each do |kmer|
|
39
|
-
oligo =
|
40
|
-
bin = "%0#{kmer_size * 2}b"
|
40
|
+
oligo = ''
|
41
|
+
bin = format("%0#{kmer_size * 2}b", kmer)
|
41
42
|
|
42
|
-
bin.scan(/.{2}/)
|
43
|
+
bin.scan(/.{2}/) do |m|
|
43
44
|
case m
|
44
45
|
when '00' then oligo << 'a'
|
45
46
|
when '01' then oligo << 't'
|
46
47
|
when '10' then oligo << 'c'
|
47
48
|
when '11' then oligo << 'g'
|
48
49
|
else
|
49
|
-
|
50
|
+
fail "unknown m #{m}"
|
50
51
|
end
|
51
|
-
|
52
|
+
end
|
52
53
|
|
53
54
|
oligos << oligo
|
54
55
|
end
|
@@ -58,33 +59,45 @@ module BioDSL
|
|
58
59
|
|
59
60
|
# Method that returns a sorted array of unique kmers, which are integer
|
60
61
|
# representations of DNA/RNA sequence oligos where A is encoded in two bits
|
61
|
-
# as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other
|
62
|
-
# are ignored. The following options apply:
|
62
|
+
# as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other
|
63
|
+
# nucleotides are ignored. The following options apply:
|
63
64
|
# * kmer_size: kmer size in the range 1-12.
|
64
65
|
# * step_size: step size in the range 1-12 (default=1).
|
65
66
|
# * score_min: drop kmers with quality score below this.
|
66
67
|
def to_kmers(options)
|
67
68
|
options[:step_size] ||= 1
|
68
69
|
options[:score_min] ||= Seq::SCORE_MAX
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
raise KmerError, "score minimum: #{options[:score_min]} out of range #{Seq::SCORE_MIN} .. #{Seq::SCORE_MAX}"
|
70
|
+
fail KmerError, 'No kmer_size' unless options[:kmer_size]
|
71
|
+
|
72
|
+
unless (1..12).include? options[:kmer_size]
|
73
|
+
fail KmerError, "Bad kmer_size: #{options[:kmer_size]}"
|
74
74
|
end
|
75
75
|
|
76
|
-
|
76
|
+
unless (1..12).include? options[:step_size]
|
77
|
+
fail KmerError, "Bad step_size: #{options[:step_size]}"
|
78
|
+
end
|
77
79
|
|
78
|
-
if
|
80
|
+
if @qual && !(Seq::SCORE_MIN..Seq::SCORE_MAX).
|
81
|
+
include?(options[:score_min])
|
82
|
+
fail KmerError, "score minimum: #{options[:score_min]} out of " \
|
83
|
+
"range #{Seq::SCORE_MIN}..#{Seq::SCORE_MAX}"
|
84
|
+
end
|
85
|
+
|
86
|
+
size = Seq::DNA.size**options[:kmer_size]
|
87
|
+
|
88
|
+
if defined?(@kmer_ary) && (@kmer_ary.count == size)
|
79
89
|
@kmer_ary.zero!
|
80
90
|
else
|
81
91
|
@kmer_ary = BioDSL::CAry.new(size, 1)
|
82
92
|
end
|
83
93
|
|
84
|
-
if
|
85
|
-
to_kmers_qual_C(
|
94
|
+
if @qual
|
95
|
+
to_kmers_qual_C(@seq, @qual, @kmer_ary.ary, length, @kmer_ary.count,
|
96
|
+
options[:kmer_size], options[:step_size],
|
97
|
+
options[:score_min], Seq::SCORE_BASE)
|
86
98
|
else
|
87
|
-
to_kmers_C(
|
99
|
+
to_kmers_C(@seq, @kmer_ary.ary, length, @kmer_ary.count,
|
100
|
+
options[:kmer_size], options[:step_size])
|
88
101
|
end
|
89
102
|
end
|
90
103
|
|
@@ -152,7 +165,7 @@ module BioDSL
|
|
152
165
|
unsigned int ary_len = FIX2UINT(_ary_len);
|
153
166
|
unsigned int kmer_size = FIX2UINT(_kmer_size);
|
154
167
|
unsigned int step_size = FIX2UINT(_step_size);
|
155
|
-
|
168
|
+
|
156
169
|
VALUE array = rb_ary_new();
|
157
170
|
unsigned int bin = 0;
|
158
171
|
unsigned int enc = 0;
|
@@ -208,7 +221,7 @@ module BioDSL
|
|
208
221
|
unsigned int step_size = FIX2UINT(_step_size);
|
209
222
|
unsigned int score_min = FIX2UINT(_score_min);
|
210
223
|
unsigned int score_base = FIX2UINT(_score_base);
|
211
|
-
|
224
|
+
|
212
225
|
VALUE array = rb_ary_new();
|
213
226
|
unsigned int bin = 0;
|
214
227
|
unsigned int enc = 0;
|
@@ -251,11 +264,13 @@ module BioDSL
|
|
251
264
|
def naive(options)
|
252
265
|
oligos = []
|
253
266
|
|
254
|
-
(0
|
255
|
-
oligo = self[i
|
267
|
+
(0..length - options[:kmer_size]).each do |i|
|
268
|
+
oligo = self[i...i + options[:kmer_size]]
|
256
269
|
|
257
270
|
next unless oligo.seq.upcase =~ /^[ATUCG]+$/
|
258
|
-
next if oligo.qual
|
271
|
+
next if oligo.qual &&
|
272
|
+
options[:scores_min] &&
|
273
|
+
(oligo.scores_min < options[:scores_min])
|
259
274
|
|
260
275
|
oligos << oligo.seq.upcase
|
261
276
|
end
|
@@ -266,11 +281,13 @@ module BioDSL
|
|
266
281
|
def naive_bin(options)
|
267
282
|
oligos = []
|
268
283
|
|
269
|
-
(0
|
270
|
-
oligo = self[i
|
284
|
+
(0..length - options[:kmer_size]).each do |i|
|
285
|
+
oligo = self[i...i + options[:kmer_size]]
|
271
286
|
|
272
287
|
next unless oligo.seq.upcase =~ /^[ATCG]+$/
|
273
|
-
next if oligo.qual
|
288
|
+
next if oligo.qual &&
|
289
|
+
options[:scores_min] &&
|
290
|
+
(oligo.scores_min < options[:scores_min])
|
274
291
|
|
275
292
|
bin = 0
|
276
293
|
|
data/lib/BioDSL/seq/levenshtein.rb
CHANGED
@@ -1,28 +1,29 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
4
|
-
#
|
5
|
-
# This program is free software; you can redistribute it and/or
|
6
|
-
# modify it under the terms of the GNU General Public License
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
8
|
-
# of the License, or (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program; if not, write to the Free Software
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
27
|
|
27
28
|
module BioDSL
|
28
29
|
# Class to calculate the Levenshtein distance between two
|
@@ -34,15 +35,14 @@ module BioDSL
|
|
34
35
|
BYTES_IN_INT = 4
|
35
36
|
|
36
37
|
def self.distance(s, t)
|
37
|
-
return 0 if s == t
|
38
|
-
return t.length if s.length == 0
|
39
|
-
return s.length if t.length == 0
|
38
|
+
return 0 if s == t
|
39
|
+
return t.length if s.length == 0
|
40
|
+
return s.length if t.length == 0
|
40
41
|
|
41
42
|
v0 = "\0" * (t.length + 1) * BYTES_IN_INT
|
42
43
|
v1 = "\0" * (t.length + 1) * BYTES_IN_INT
|
43
44
|
|
44
|
-
|
45
|
-
l.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
|
45
|
+
new.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
|
46
46
|
end
|
47
47
|
|
48
48
|
# >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
|
@@ -82,32 +82,27 @@ module BioDSL
|
|
82
82
|
unsigned int i = 0;
|
83
83
|
unsigned int j = 0;
|
84
84
|
unsigned int cost = 0;
|
85
|
-
|
85
|
+
|
86
86
|
for (i = 0; i < t_len + 1; i++)
|
87
87
|
v0[i] = i;
|
88
|
-
|
88
|
+
|
89
89
|
for (i = 0; i < s_len; i++)
|
90
90
|
{
|
91
91
|
v1[0] = i + 1;
|
92
|
-
|
92
|
+
|
93
93
|
for (j = 0; j < t_len; j++)
|
94
94
|
{
|
95
95
|
cost = (MATCH(s[i], t[j])) ? 0 : 1;
|
96
96
|
v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost);
|
97
97
|
}
|
98
|
-
|
98
|
+
|
99
99
|
for (j = 0; j < t_len + 1; j++)
|
100
100
|
v0[j] = v1[j];
|
101
101
|
}
|
102
|
-
|
102
|
+
|
103
103
|
return UINT2NUM(v1[t_len]);
|
104
104
|
}
|
105
105
|
}
|
106
106
|
end
|
107
107
|
end
|
108
108
|
end
|
109
|
-
|
110
|
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
111
|
-
|
112
|
-
|
113
|
-
__END__
|
data/lib/BioDSL/seq/translate.rb
CHANGED
@@ -1,30 +1,33 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
4
|
-
#
|
5
|
-
# This program is free software; you can redistribute it and/or
|
6
|
-
# modify it under the terms of the GNU General Public License
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
8
|
-
# of the License, or (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program; if not, write to the Free Software
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
# Namespace for BioDSL.
|
27
29
|
module BioDSL
|
30
|
+
# Namespace for Translate methods.
|
28
31
|
module Translate
|
29
32
|
# Translation table 11
|
30
33
|
# (http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes#SG11)
|
@@ -34,27 +37,27 @@ module BioDSL
|
|
34
37
|
# Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
|
35
38
|
# Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
|
36
39
|
TRANS_TAB11_START = {
|
37
|
-
|
38
|
-
|
40
|
+
'TTG' => 'M', 'CTG' => 'M', 'ATT' => 'M', 'ATC' => 'M',
|
41
|
+
'ATA' => 'M', 'ATG' => 'M', 'GTG' => 'M'
|
39
42
|
}
|
40
43
|
|
41
44
|
TRANS_TAB11 = {
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
45
|
+
'TTT' => 'F', 'TCT' => 'S', 'TAT' => 'Y', 'TGT' => 'C',
|
46
|
+
'TTC' => 'F', 'TCC' => 'S', 'TAC' => 'Y', 'TGC' => 'C',
|
47
|
+
'TTA' => 'L', 'TCA' => 'S', 'TAA' => '*', 'TGA' => '*',
|
48
|
+
'TTG' => 'L', 'TCG' => 'S', 'TAG' => '*', 'TGG' => 'W',
|
49
|
+
'CTT' => 'L', 'CCT' => 'P', 'CAT' => 'H', 'CGT' => 'R',
|
50
|
+
'CTC' => 'L', 'CCC' => 'P', 'CAC' => 'H', 'CGC' => 'R',
|
51
|
+
'CTA' => 'L', 'CCA' => 'P', 'CAA' => 'Q', 'CGA' => 'R',
|
52
|
+
'CTG' => 'L', 'CCG' => 'P', 'CAG' => 'Q', 'CGG' => 'R',
|
53
|
+
'ATT' => 'I', 'ACT' => 'T', 'AAT' => 'N', 'AGT' => 'S',
|
54
|
+
'ATC' => 'I', 'ACC' => 'T', 'AAC' => 'N', 'AGC' => 'S',
|
55
|
+
'ATA' => 'I', 'ACA' => 'T', 'AAA' => 'K', 'AGA' => 'R',
|
56
|
+
'ATG' => 'M', 'ACG' => 'T', 'AAG' => 'K', 'AGG' => 'R',
|
57
|
+
'GTT' => 'V', 'GCT' => 'A', 'GAT' => 'D', 'GGT' => 'G',
|
58
|
+
'GTC' => 'V', 'GCC' => 'A', 'GAC' => 'D', 'GGC' => 'G',
|
59
|
+
'GTA' => 'V', 'GCA' => 'A', 'GAA' => 'E', 'GGA' => 'G',
|
60
|
+
'GTG' => 'V', 'GCG' => 'A', 'GAG' => 'E', 'GGG' => 'G'
|
58
61
|
}
|
59
62
|
|
60
63
|
# Method to translate a DNA sequence to protein.
|
@@ -69,41 +72,47 @@ module BioDSL
|
|
69
72
|
self
|
70
73
|
end
|
71
74
|
|
72
|
-
|
75
|
+
alias_method :to_protein!, :translate!
|
73
76
|
|
74
77
|
def translate(trans_tab = 11)
|
75
|
-
|
76
|
-
|
78
|
+
unless @type == :dna
|
79
|
+
fail SeqError, "Sequence type must be 'dna' - not #{@type}"
|
80
|
+
end
|
81
|
+
|
82
|
+
unless (length % 3) == 0
|
83
|
+
fail SeqError, 'Sequence length must be a multiplum of 3 - ' \
|
84
|
+
" was: #{length}"
|
85
|
+
end
|
77
86
|
|
78
87
|
case trans_tab
|
79
88
|
when 11
|
80
89
|
codon_start_hash = TRANS_TAB11_START
|
81
90
|
codon_hash = TRANS_TAB11
|
82
91
|
else
|
83
|
-
|
92
|
+
fail SeqError, "Unknown translation table: #{trans_tab}"
|
84
93
|
end
|
85
94
|
|
86
|
-
codon =
|
95
|
+
codon = @seq[0...3].upcase
|
87
96
|
|
88
97
|
aa = codon_start_hash[codon]
|
89
98
|
|
90
|
-
|
99
|
+
fail SeqError, "Unknown start codon: #{codon}" if aa.nil?
|
91
100
|
|
92
101
|
protein = aa.dup
|
93
102
|
|
94
|
-
(3
|
95
|
-
codon =
|
103
|
+
(3...length).step(3) do |i|
|
104
|
+
codon = @seq[i...i + 3].upcase
|
96
105
|
|
97
106
|
aa = codon_hash[codon]
|
98
107
|
|
99
|
-
|
108
|
+
fail SeqError, "Unknown codon: #{codon}" if aa.nil?
|
100
109
|
|
101
110
|
protein << aa.dup
|
102
111
|
end
|
103
112
|
|
104
|
-
Seq.new(seq_name:
|
113
|
+
Seq.new(seq_name: @seq_name, seq: protein[0..-2], type: :protein)
|
105
114
|
end
|
106
115
|
|
107
|
-
|
116
|
+
alias_method :to_protein, :translate
|
108
117
|
end
|
109
118
|
end
|