BioDSL 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/BioDSL.gemspec +1 -1
- data/Gemfile +6 -0
- data/README.md +289 -155
- data/Rakefile +18 -16
- data/lib/BioDSL.rb +1 -1
- data/lib/BioDSL/cary.rb +78 -53
- data/lib/BioDSL/command.rb +2 -2
- data/lib/BioDSL/commands.rb +1 -1
- data/lib/BioDSL/commands/add_key.rb +1 -1
- data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
- data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
- data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
- data/lib/BioDSL/commands/classify_seq.rb +8 -8
- data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
- data/lib/BioDSL/commands/clip_primer.rb +7 -7
- data/lib/BioDSL/commands/cluster_otus.rb +5 -5
- data/lib/BioDSL/commands/collapse_otus.rb +2 -2
- data/lib/BioDSL/commands/collect_otus.rb +2 -2
- data/lib/BioDSL/commands/complement_seq.rb +4 -4
- data/lib/BioDSL/commands/count.rb +1 -1
- data/lib/BioDSL/commands/count_values.rb +2 -2
- data/lib/BioDSL/commands/degap_seq.rb +6 -7
- data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
- data/lib/BioDSL/commands/dump.rb +2 -2
- data/lib/BioDSL/commands/filter_rrna.rb +4 -4
- data/lib/BioDSL/commands/genecall.rb +7 -7
- data/lib/BioDSL/commands/grab.rb +1 -1
- data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
- data/lib/BioDSL/commands/mask_seq.rb +4 -4
- data/lib/BioDSL/commands/mean_scores.rb +2 -2
- data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
- data/lib/BioDSL/commands/merge_table.rb +1 -1
- data/lib/BioDSL/commands/merge_values.rb +1 -1
- data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
- data/lib/BioDSL/commands/plot_histogram.rb +4 -4
- data/lib/BioDSL/commands/plot_matches.rb +5 -5
- data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
- data/lib/BioDSL/commands/plot_scores.rb +7 -7
- data/lib/BioDSL/commands/random.rb +1 -1
- data/lib/BioDSL/commands/read_fasta.rb +9 -9
- data/lib/BioDSL/commands/read_fastq.rb +16 -16
- data/lib/BioDSL/commands/read_table.rb +2 -3
- data/lib/BioDSL/commands/reverse_seq.rb +4 -4
- data/lib/BioDSL/commands/slice_align.rb +4 -4
- data/lib/BioDSL/commands/slice_seq.rb +3 -3
- data/lib/BioDSL/commands/sort.rb +1 -1
- data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
- data/lib/BioDSL/commands/split_values.rb +2 -2
- data/lib/BioDSL/commands/trim_primer.rb +13 -8
- data/lib/BioDSL/commands/trim_seq.rb +5 -5
- data/lib/BioDSL/commands/uchime_ref.rb +6 -6
- data/lib/BioDSL/commands/uclust.rb +5 -5
- data/lib/BioDSL/commands/unique_values.rb +1 -1
- data/lib/BioDSL/commands/usearch_global.rb +2 -2
- data/lib/BioDSL/commands/usearch_local.rb +2 -2
- data/lib/BioDSL/commands/write_fasta.rb +7 -9
- data/lib/BioDSL/commands/write_fastq.rb +4 -4
- data/lib/BioDSL/commands/write_table.rb +3 -3
- data/lib/BioDSL/commands/write_tree.rb +2 -3
- data/lib/BioDSL/config.rb +2 -2
- data/lib/BioDSL/csv.rb +8 -10
- data/lib/BioDSL/debug.rb +1 -1
- data/lib/BioDSL/fasta.rb +54 -40
- data/lib/BioDSL/fastq.rb +35 -32
- data/lib/BioDSL/filesys.rb +56 -47
- data/lib/BioDSL/fork.rb +1 -1
- data/lib/BioDSL/hamming.rb +1 -1
- data/lib/BioDSL/helpers.rb +1 -1
- data/lib/BioDSL/helpers/aux_helper.rb +1 -1
- data/lib/BioDSL/helpers/email_helper.rb +1 -1
- data/lib/BioDSL/helpers/history_helper.rb +1 -1
- data/lib/BioDSL/helpers/log_helper.rb +1 -1
- data/lib/BioDSL/helpers/options_helper.rb +1 -1
- data/lib/BioDSL/helpers/status_helper.rb +1 -1
- data/lib/BioDSL/html_report.rb +1 -1
- data/lib/BioDSL/math.rb +1 -1
- data/lib/BioDSL/mummer.rb +1 -1
- data/lib/BioDSL/pipeline.rb +1 -1
- data/lib/BioDSL/seq.rb +240 -231
- data/lib/BioDSL/seq/ambiguity.rb +1 -1
- data/lib/BioDSL/seq/assemble.rb +1 -1
- data/lib/BioDSL/seq/backtrack.rb +93 -76
- data/lib/BioDSL/seq/digest.rb +1 -1
- data/lib/BioDSL/seq/dynamic.rb +43 -55
- data/lib/BioDSL/seq/homopolymer.rb +34 -36
- data/lib/BioDSL/seq/kmer.rb +67 -50
- data/lib/BioDSL/seq/levenshtein.rb +35 -40
- data/lib/BioDSL/seq/translate.rb +64 -55
- data/lib/BioDSL/seq/trim.rb +60 -50
- data/lib/BioDSL/serializer.rb +1 -1
- data/lib/BioDSL/stream.rb +1 -1
- data/lib/BioDSL/taxonomy.rb +1 -1
- data/lib/BioDSL/test.rb +1 -1
- data/lib/BioDSL/tmp_dir.rb +1 -1
- data/lib/BioDSL/usearch.rb +1 -1
- data/lib/BioDSL/verbose.rb +1 -1
- data/lib/BioDSL/version.rb +2 -2
- data/test/BioDSL/commands/test_add_key.rb +1 -1
- data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_clip_primer.rb +1 -1
- data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
- data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
- data/test/BioDSL/commands/test_collect_otus.rb +1 -1
- data/test/BioDSL/commands/test_complement_seq.rb +1 -1
- data/test/BioDSL/commands/test_count.rb +1 -1
- data/test/BioDSL/commands/test_count_values.rb +1 -1
- data/test/BioDSL/commands/test_degap_seq.rb +1 -1
- data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
- data/test/BioDSL/commands/test_dump.rb +1 -1
- data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
- data/test/BioDSL/commands/test_genecall.rb +1 -1
- data/test/BioDSL/commands/test_grab.rb +1 -1
- data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
- data/test/BioDSL/commands/test_mask_seq.rb +1 -1
- data/test/BioDSL/commands/test_mean_scores.rb +1 -1
- data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_merge_table.rb +1 -1
- data/test/BioDSL/commands/test_merge_values.rb +1 -1
- data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
- data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
- data/test/BioDSL/commands/test_plot_matches.rb +1 -1
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_plot_scores.rb +1 -1
- data/test/BioDSL/commands/test_random.rb +1 -1
- data/test/BioDSL/commands/test_read_fasta.rb +1 -1
- data/test/BioDSL/commands/test_read_fastq.rb +1 -1
- data/test/BioDSL/commands/test_read_table.rb +1 -1
- data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
- data/test/BioDSL/commands/test_slice_align.rb +1 -1
- data/test/BioDSL/commands/test_slice_seq.rb +1 -1
- data/test/BioDSL/commands/test_sort.rb +1 -1
- data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_split_values.rb +1 -1
- data/test/BioDSL/commands/test_trim_primer.rb +1 -1
- data/test/BioDSL/commands/test_trim_seq.rb +1 -1
- data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
- data/test/BioDSL/commands/test_uclust.rb +1 -1
- data/test/BioDSL/commands/test_unique_values.rb +1 -1
- data/test/BioDSL/commands/test_usearch_global.rb +1 -1
- data/test/BioDSL/commands/test_usearch_local.rb +1 -1
- data/test/BioDSL/commands/test_write_fasta.rb +1 -1
- data/test/BioDSL/commands/test_write_fastq.rb +1 -1
- data/test/BioDSL/commands/test_write_table.rb +1 -1
- data/test/BioDSL/commands/test_write_tree.rb +1 -1
- data/test/BioDSL/helpers/test_options_helper.rb +3 -3
- data/test/BioDSL/seq/test_assemble.rb +58 -56
- data/test/BioDSL/seq/test_backtrack.rb +83 -81
- data/test/BioDSL/seq/test_digest.rb +47 -45
- data/test/BioDSL/seq/test_dynamic.rb +66 -64
- data/test/BioDSL/seq/test_homopolymer.rb +35 -33
- data/test/BioDSL/seq/test_kmer.rb +29 -28
- data/test/BioDSL/seq/test_translate.rb +44 -42
- data/test/BioDSL/seq/test_trim.rb +59 -57
- data/test/BioDSL/test_cary.rb +1 -1
- data/test/BioDSL/test_command.rb +2 -2
- data/test/BioDSL/test_csv.rb +34 -31
- data/test/BioDSL/test_debug.rb +31 -31
- data/test/BioDSL/test_fasta.rb +30 -29
- data/test/BioDSL/test_fastq.rb +27 -26
- data/test/BioDSL/test_filesys.rb +28 -27
- data/test/BioDSL/test_fork.rb +29 -28
- data/test/BioDSL/test_math.rb +31 -30
- data/test/BioDSL/test_mummer.rb +1 -1
- data/test/BioDSL/test_pipeline.rb +1 -1
- data/test/BioDSL/test_seq.rb +42 -41
- data/test/BioDSL/test_serializer.rb +35 -33
- data/test/BioDSL/test_stream.rb +28 -27
- data/test/BioDSL/test_taxonomy.rb +38 -37
- data/test/BioDSL/test_test.rb +32 -31
- data/test/BioDSL/test_tmp_dir.rb +1 -1
- data/test/BioDSL/test_usearch.rb +28 -27
- data/test/BioDSL/test_verbose.rb +32 -31
- data/test/helper.rb +34 -31
- metadata +3 -2
data/lib/BioDSL/seq/kmer.rb
CHANGED
@@ -1,28 +1,29 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
4
|
-
#
|
5
|
-
# This program is free software; you can redistribute it and/or
|
6
|
-
# modify it under the terms of the GNU General Public License
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
8
|
-
# of the License, or (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program; if not, write to the Free Software
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
27
|
|
27
28
|
module BioDSL
|
28
29
|
# Error class for all exceptions to do with Kmer.
|
@@ -36,19 +37,19 @@ module BioDSL
|
|
36
37
|
oligos = []
|
37
38
|
|
38
39
|
kmers.each do |kmer|
|
39
|
-
oligo =
|
40
|
-
bin = "%0#{kmer_size * 2}b"
|
40
|
+
oligo = ''
|
41
|
+
bin = format("%0#{kmer_size * 2}b", kmer)
|
41
42
|
|
42
|
-
bin.scan(/.{2}/)
|
43
|
+
bin.scan(/.{2}/) do |m|
|
43
44
|
case m
|
44
45
|
when '00' then oligo << 'a'
|
45
46
|
when '01' then oligo << 't'
|
46
47
|
when '10' then oligo << 'c'
|
47
48
|
when '11' then oligo << 'g'
|
48
49
|
else
|
49
|
-
|
50
|
+
fail "unknown m #{m}"
|
50
51
|
end
|
51
|
-
|
52
|
+
end
|
52
53
|
|
53
54
|
oligos << oligo
|
54
55
|
end
|
@@ -58,33 +59,45 @@ module BioDSL
|
|
58
59
|
|
59
60
|
# Method that returns a sorted array of unique kmers, which are integer
|
60
61
|
# representations of DNA/RNA sequence oligos where A is encoded in two bits
|
61
|
-
# as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other
|
62
|
-
# are ignored. The following options apply:
|
62
|
+
# as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other
|
63
|
+
# nucleotides are ignored. The following options apply:
|
63
64
|
# * kmer_size: kmer size in the range 1-12.
|
64
65
|
# * step_size: step size in the range 1-12 (default=1).
|
65
66
|
# * score_min: drop kmers with quality score below this.
|
66
67
|
def to_kmers(options)
|
67
68
|
options[:step_size] ||= 1
|
68
69
|
options[:score_min] ||= Seq::SCORE_MAX
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
raise KmerError, "score minimum: #{options[:score_min]} out of range #{Seq::SCORE_MIN} .. #{Seq::SCORE_MAX}"
|
70
|
+
fail KmerError, 'No kmer_size' unless options[:kmer_size]
|
71
|
+
|
72
|
+
unless (1..12).include? options[:kmer_size]
|
73
|
+
fail KmerError, "Bad kmer_size: #{options[:kmer_size]}"
|
74
74
|
end
|
75
75
|
|
76
|
-
|
76
|
+
unless (1..12).include? options[:step_size]
|
77
|
+
fail KmerError, "Bad step_size: #{options[:step_size]}"
|
78
|
+
end
|
77
79
|
|
78
|
-
if
|
80
|
+
if @qual && !(Seq::SCORE_MIN..Seq::SCORE_MAX).
|
81
|
+
include?(options[:score_min])
|
82
|
+
fail KmerError, "score minimum: #{options[:score_min]} out of " \
|
83
|
+
"range #{Seq::SCORE_MIN}..#{Seq::SCORE_MAX}"
|
84
|
+
end
|
85
|
+
|
86
|
+
size = Seq::DNA.size**options[:kmer_size]
|
87
|
+
|
88
|
+
if defined?(@kmer_ary) && (@kmer_ary.count == size)
|
79
89
|
@kmer_ary.zero!
|
80
90
|
else
|
81
91
|
@kmer_ary = BioDSL::CAry.new(size, 1)
|
82
92
|
end
|
83
93
|
|
84
|
-
if
|
85
|
-
to_kmers_qual_C(
|
94
|
+
if @qual
|
95
|
+
to_kmers_qual_C(@seq, @qual, @kmer_ary.ary, length, @kmer_ary.count,
|
96
|
+
options[:kmer_size], options[:step_size],
|
97
|
+
options[:score_min], Seq::SCORE_BASE)
|
86
98
|
else
|
87
|
-
to_kmers_C(
|
99
|
+
to_kmers_C(@seq, @kmer_ary.ary, length, @kmer_ary.count,
|
100
|
+
options[:kmer_size], options[:step_size])
|
88
101
|
end
|
89
102
|
end
|
90
103
|
|
@@ -152,7 +165,7 @@ module BioDSL
|
|
152
165
|
unsigned int ary_len = FIX2UINT(_ary_len);
|
153
166
|
unsigned int kmer_size = FIX2UINT(_kmer_size);
|
154
167
|
unsigned int step_size = FIX2UINT(_step_size);
|
155
|
-
|
168
|
+
|
156
169
|
VALUE array = rb_ary_new();
|
157
170
|
unsigned int bin = 0;
|
158
171
|
unsigned int enc = 0;
|
@@ -208,7 +221,7 @@ module BioDSL
|
|
208
221
|
unsigned int step_size = FIX2UINT(_step_size);
|
209
222
|
unsigned int score_min = FIX2UINT(_score_min);
|
210
223
|
unsigned int score_base = FIX2UINT(_score_base);
|
211
|
-
|
224
|
+
|
212
225
|
VALUE array = rb_ary_new();
|
213
226
|
unsigned int bin = 0;
|
214
227
|
unsigned int enc = 0;
|
@@ -251,11 +264,13 @@ module BioDSL
|
|
251
264
|
def naive(options)
|
252
265
|
oligos = []
|
253
266
|
|
254
|
-
(0
|
255
|
-
oligo = self[i
|
267
|
+
(0..length - options[:kmer_size]).each do |i|
|
268
|
+
oligo = self[i...i + options[:kmer_size]]
|
256
269
|
|
257
270
|
next unless oligo.seq.upcase =~ /^[ATUCG]+$/
|
258
|
-
next if oligo.qual
|
271
|
+
next if oligo.qual &&
|
272
|
+
options[:scores_min] &&
|
273
|
+
(oligo.scores_min < options[:scores_min])
|
259
274
|
|
260
275
|
oligos << oligo.seq.upcase
|
261
276
|
end
|
@@ -266,11 +281,13 @@ module BioDSL
|
|
266
281
|
def naive_bin(options)
|
267
282
|
oligos = []
|
268
283
|
|
269
|
-
(0
|
270
|
-
oligo = self[i
|
284
|
+
(0..length - options[:kmer_size]).each do |i|
|
285
|
+
oligo = self[i...i + options[:kmer_size]]
|
271
286
|
|
272
287
|
next unless oligo.seq.upcase =~ /^[ATCG]+$/
|
273
|
-
next if oligo.qual
|
288
|
+
next if oligo.qual &&
|
289
|
+
options[:scores_min] &&
|
290
|
+
(oligo.scores_min < options[:scores_min])
|
274
291
|
|
275
292
|
bin = 0
|
276
293
|
|
@@ -1,28 +1,29 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
4
|
-
#
|
5
|
-
# This program is free software; you can redistribute it and/or
|
6
|
-
# modify it under the terms of the GNU General Public License
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
8
|
-
# of the License, or (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program; if not, write to the Free Software
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
27
|
|
27
28
|
module BioDSL
|
28
29
|
# Class to calculate the Levenshtein distance between two
|
@@ -34,15 +35,14 @@ module BioDSL
|
|
34
35
|
BYTES_IN_INT = 4
|
35
36
|
|
36
37
|
def self.distance(s, t)
|
37
|
-
return 0 if s == t
|
38
|
-
return t.length if s.length == 0
|
39
|
-
return s.length if t.length == 0
|
38
|
+
return 0 if s == t
|
39
|
+
return t.length if s.length == 0
|
40
|
+
return s.length if t.length == 0
|
40
41
|
|
41
42
|
v0 = "\0" * (t.length + 1) * BYTES_IN_INT
|
42
43
|
v1 = "\0" * (t.length + 1) * BYTES_IN_INT
|
43
44
|
|
44
|
-
|
45
|
-
l.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
|
45
|
+
new.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
|
46
46
|
end
|
47
47
|
|
48
48
|
# >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
|
@@ -82,32 +82,27 @@ module BioDSL
|
|
82
82
|
unsigned int i = 0;
|
83
83
|
unsigned int j = 0;
|
84
84
|
unsigned int cost = 0;
|
85
|
-
|
85
|
+
|
86
86
|
for (i = 0; i < t_len + 1; i++)
|
87
87
|
v0[i] = i;
|
88
|
-
|
88
|
+
|
89
89
|
for (i = 0; i < s_len; i++)
|
90
90
|
{
|
91
91
|
v1[0] = i + 1;
|
92
|
-
|
92
|
+
|
93
93
|
for (j = 0; j < t_len; j++)
|
94
94
|
{
|
95
95
|
cost = (MATCH(s[i], t[j])) ? 0 : 1;
|
96
96
|
v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost);
|
97
97
|
}
|
98
|
-
|
98
|
+
|
99
99
|
for (j = 0; j < t_len + 1; j++)
|
100
100
|
v0[j] = v1[j];
|
101
101
|
}
|
102
|
-
|
102
|
+
|
103
103
|
return UINT2NUM(v1[t_len]);
|
104
104
|
}
|
105
105
|
}
|
106
106
|
end
|
107
107
|
end
|
108
108
|
end
|
109
|
-
|
110
|
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
111
|
-
|
112
|
-
|
113
|
-
__END__
|
data/lib/BioDSL/seq/translate.rb
CHANGED
@@ -1,30 +1,33 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
4
|
-
#
|
5
|
-
# This program is free software; you can redistribute it and/or
|
6
|
-
# modify it under the terms of the GNU General Public License
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
8
|
-
# of the License, or (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program; if not, write to the Free Software
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
# Namespace for BioDSL.
|
27
29
|
module BioDSL
|
30
|
+
# Namespace for Translate methods.
|
28
31
|
module Translate
|
29
32
|
# Translation table 11
|
30
33
|
# (http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes#SG11)
|
@@ -34,27 +37,27 @@ module BioDSL
|
|
34
37
|
# Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
|
35
38
|
# Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
|
36
39
|
TRANS_TAB11_START = {
|
37
|
-
|
38
|
-
|
40
|
+
'TTG' => 'M', 'CTG' => 'M', 'ATT' => 'M', 'ATC' => 'M',
|
41
|
+
'ATA' => 'M', 'ATG' => 'M', 'GTG' => 'M'
|
39
42
|
}
|
40
43
|
|
41
44
|
TRANS_TAB11 = {
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
45
|
+
'TTT' => 'F', 'TCT' => 'S', 'TAT' => 'Y', 'TGT' => 'C',
|
46
|
+
'TTC' => 'F', 'TCC' => 'S', 'TAC' => 'Y', 'TGC' => 'C',
|
47
|
+
'TTA' => 'L', 'TCA' => 'S', 'TAA' => '*', 'TGA' => '*',
|
48
|
+
'TTG' => 'L', 'TCG' => 'S', 'TAG' => '*', 'TGG' => 'W',
|
49
|
+
'CTT' => 'L', 'CCT' => 'P', 'CAT' => 'H', 'CGT' => 'R',
|
50
|
+
'CTC' => 'L', 'CCC' => 'P', 'CAC' => 'H', 'CGC' => 'R',
|
51
|
+
'CTA' => 'L', 'CCA' => 'P', 'CAA' => 'Q', 'CGA' => 'R',
|
52
|
+
'CTG' => 'L', 'CCG' => 'P', 'CAG' => 'Q', 'CGG' => 'R',
|
53
|
+
'ATT' => 'I', 'ACT' => 'T', 'AAT' => 'N', 'AGT' => 'S',
|
54
|
+
'ATC' => 'I', 'ACC' => 'T', 'AAC' => 'N', 'AGC' => 'S',
|
55
|
+
'ATA' => 'I', 'ACA' => 'T', 'AAA' => 'K', 'AGA' => 'R',
|
56
|
+
'ATG' => 'M', 'ACG' => 'T', 'AAG' => 'K', 'AGG' => 'R',
|
57
|
+
'GTT' => 'V', 'GCT' => 'A', 'GAT' => 'D', 'GGT' => 'G',
|
58
|
+
'GTC' => 'V', 'GCC' => 'A', 'GAC' => 'D', 'GGC' => 'G',
|
59
|
+
'GTA' => 'V', 'GCA' => 'A', 'GAA' => 'E', 'GGA' => 'G',
|
60
|
+
'GTG' => 'V', 'GCG' => 'A', 'GAG' => 'E', 'GGG' => 'G'
|
58
61
|
}
|
59
62
|
|
60
63
|
# Method to translate a DNA sequence to protein.
|
@@ -69,41 +72,47 @@ module BioDSL
|
|
69
72
|
self
|
70
73
|
end
|
71
74
|
|
72
|
-
|
75
|
+
alias_method :to_protein!, :translate!
|
73
76
|
|
74
77
|
def translate(trans_tab = 11)
|
75
|
-
|
76
|
-
|
78
|
+
unless @type == :dna
|
79
|
+
fail SeqError, "Sequence type must be 'dna' - not #{@type}"
|
80
|
+
end
|
81
|
+
|
82
|
+
unless (length % 3) == 0
|
83
|
+
fail SeqError, 'Sequence length must be a multiplum of 3 - ' \
|
84
|
+
" was: #{length}"
|
85
|
+
end
|
77
86
|
|
78
87
|
case trans_tab
|
79
88
|
when 11
|
80
89
|
codon_start_hash = TRANS_TAB11_START
|
81
90
|
codon_hash = TRANS_TAB11
|
82
91
|
else
|
83
|
-
|
92
|
+
fail SeqError, "Unknown translation table: #{trans_tab}"
|
84
93
|
end
|
85
94
|
|
86
|
-
codon =
|
95
|
+
codon = @seq[0...3].upcase
|
87
96
|
|
88
97
|
aa = codon_start_hash[codon]
|
89
98
|
|
90
|
-
|
99
|
+
fail SeqError, "Unknown start codon: #{codon}" if aa.nil?
|
91
100
|
|
92
101
|
protein = aa.dup
|
93
102
|
|
94
|
-
(3
|
95
|
-
codon =
|
103
|
+
(3...length).step(3) do |i|
|
104
|
+
codon = @seq[i...i + 3].upcase
|
96
105
|
|
97
106
|
aa = codon_hash[codon]
|
98
107
|
|
99
|
-
|
108
|
+
fail SeqError, "Unknown codon: #{codon}" if aa.nil?
|
100
109
|
|
101
110
|
protein << aa.dup
|
102
111
|
end
|
103
112
|
|
104
|
-
Seq.new(seq_name:
|
113
|
+
Seq.new(seq_name: @seq_name, seq: protein[0..-2], type: :protein)
|
105
114
|
end
|
106
115
|
|
107
|
-
|
116
|
+
alias_method :to_protein, :translate
|
108
117
|
end
|
109
118
|
end
|