BioDSL 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (186)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/BioDSL.gemspec +1 -1
  4. data/Gemfile +6 -0
  5. data/README.md +289 -155
  6. data/Rakefile +18 -16
  7. data/lib/BioDSL.rb +1 -1
  8. data/lib/BioDSL/cary.rb +78 -53
  9. data/lib/BioDSL/command.rb +2 -2
  10. data/lib/BioDSL/commands.rb +1 -1
  11. data/lib/BioDSL/commands/add_key.rb +1 -1
  12. data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
  13. data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
  14. data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
  15. data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
  16. data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
  17. data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
  18. data/lib/BioDSL/commands/classify_seq.rb +8 -8
  19. data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
  20. data/lib/BioDSL/commands/clip_primer.rb +7 -7
  21. data/lib/BioDSL/commands/cluster_otus.rb +5 -5
  22. data/lib/BioDSL/commands/collapse_otus.rb +2 -2
  23. data/lib/BioDSL/commands/collect_otus.rb +2 -2
  24. data/lib/BioDSL/commands/complement_seq.rb +4 -4
  25. data/lib/BioDSL/commands/count.rb +1 -1
  26. data/lib/BioDSL/commands/count_values.rb +2 -2
  27. data/lib/BioDSL/commands/degap_seq.rb +6 -7
  28. data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
  29. data/lib/BioDSL/commands/dump.rb +2 -2
  30. data/lib/BioDSL/commands/filter_rrna.rb +4 -4
  31. data/lib/BioDSL/commands/genecall.rb +7 -7
  32. data/lib/BioDSL/commands/grab.rb +1 -1
  33. data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
  34. data/lib/BioDSL/commands/mask_seq.rb +4 -4
  35. data/lib/BioDSL/commands/mean_scores.rb +2 -2
  36. data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
  37. data/lib/BioDSL/commands/merge_table.rb +1 -1
  38. data/lib/BioDSL/commands/merge_values.rb +1 -1
  39. data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
  40. data/lib/BioDSL/commands/plot_histogram.rb +4 -4
  41. data/lib/BioDSL/commands/plot_matches.rb +5 -5
  42. data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
  43. data/lib/BioDSL/commands/plot_scores.rb +7 -7
  44. data/lib/BioDSL/commands/random.rb +1 -1
  45. data/lib/BioDSL/commands/read_fasta.rb +9 -9
  46. data/lib/BioDSL/commands/read_fastq.rb +16 -16
  47. data/lib/BioDSL/commands/read_table.rb +2 -3
  48. data/lib/BioDSL/commands/reverse_seq.rb +4 -4
  49. data/lib/BioDSL/commands/slice_align.rb +4 -4
  50. data/lib/BioDSL/commands/slice_seq.rb +3 -3
  51. data/lib/BioDSL/commands/sort.rb +1 -1
  52. data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
  53. data/lib/BioDSL/commands/split_values.rb +2 -2
  54. data/lib/BioDSL/commands/trim_primer.rb +13 -8
  55. data/lib/BioDSL/commands/trim_seq.rb +5 -5
  56. data/lib/BioDSL/commands/uchime_ref.rb +6 -6
  57. data/lib/BioDSL/commands/uclust.rb +5 -5
  58. data/lib/BioDSL/commands/unique_values.rb +1 -1
  59. data/lib/BioDSL/commands/usearch_global.rb +2 -2
  60. data/lib/BioDSL/commands/usearch_local.rb +2 -2
  61. data/lib/BioDSL/commands/write_fasta.rb +7 -9
  62. data/lib/BioDSL/commands/write_fastq.rb +4 -4
  63. data/lib/BioDSL/commands/write_table.rb +3 -3
  64. data/lib/BioDSL/commands/write_tree.rb +2 -3
  65. data/lib/BioDSL/config.rb +2 -2
  66. data/lib/BioDSL/csv.rb +8 -10
  67. data/lib/BioDSL/debug.rb +1 -1
  68. data/lib/BioDSL/fasta.rb +54 -40
  69. data/lib/BioDSL/fastq.rb +35 -32
  70. data/lib/BioDSL/filesys.rb +56 -47
  71. data/lib/BioDSL/fork.rb +1 -1
  72. data/lib/BioDSL/hamming.rb +1 -1
  73. data/lib/BioDSL/helpers.rb +1 -1
  74. data/lib/BioDSL/helpers/aux_helper.rb +1 -1
  75. data/lib/BioDSL/helpers/email_helper.rb +1 -1
  76. data/lib/BioDSL/helpers/history_helper.rb +1 -1
  77. data/lib/BioDSL/helpers/log_helper.rb +1 -1
  78. data/lib/BioDSL/helpers/options_helper.rb +1 -1
  79. data/lib/BioDSL/helpers/status_helper.rb +1 -1
  80. data/lib/BioDSL/html_report.rb +1 -1
  81. data/lib/BioDSL/math.rb +1 -1
  82. data/lib/BioDSL/mummer.rb +1 -1
  83. data/lib/BioDSL/pipeline.rb +1 -1
  84. data/lib/BioDSL/seq.rb +240 -231
  85. data/lib/BioDSL/seq/ambiguity.rb +1 -1
  86. data/lib/BioDSL/seq/assemble.rb +1 -1
  87. data/lib/BioDSL/seq/backtrack.rb +93 -76
  88. data/lib/BioDSL/seq/digest.rb +1 -1
  89. data/lib/BioDSL/seq/dynamic.rb +43 -55
  90. data/lib/BioDSL/seq/homopolymer.rb +34 -36
  91. data/lib/BioDSL/seq/kmer.rb +67 -50
  92. data/lib/BioDSL/seq/levenshtein.rb +35 -40
  93. data/lib/BioDSL/seq/translate.rb +64 -55
  94. data/lib/BioDSL/seq/trim.rb +60 -50
  95. data/lib/BioDSL/serializer.rb +1 -1
  96. data/lib/BioDSL/stream.rb +1 -1
  97. data/lib/BioDSL/taxonomy.rb +1 -1
  98. data/lib/BioDSL/test.rb +1 -1
  99. data/lib/BioDSL/tmp_dir.rb +1 -1
  100. data/lib/BioDSL/usearch.rb +1 -1
  101. data/lib/BioDSL/verbose.rb +1 -1
  102. data/lib/BioDSL/version.rb +2 -2
  103. data/test/BioDSL/commands/test_add_key.rb +1 -1
  104. data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
  105. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
  106. data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
  107. data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
  108. data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
  109. data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
  110. data/test/BioDSL/commands/test_classify_seq.rb +1 -1
  111. data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
  112. data/test/BioDSL/commands/test_clip_primer.rb +1 -1
  113. data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
  114. data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
  115. data/test/BioDSL/commands/test_collect_otus.rb +1 -1
  116. data/test/BioDSL/commands/test_complement_seq.rb +1 -1
  117. data/test/BioDSL/commands/test_count.rb +1 -1
  118. data/test/BioDSL/commands/test_count_values.rb +1 -1
  119. data/test/BioDSL/commands/test_degap_seq.rb +1 -1
  120. data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
  121. data/test/BioDSL/commands/test_dump.rb +1 -1
  122. data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
  123. data/test/BioDSL/commands/test_genecall.rb +1 -1
  124. data/test/BioDSL/commands/test_grab.rb +1 -1
  125. data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
  126. data/test/BioDSL/commands/test_mask_seq.rb +1 -1
  127. data/test/BioDSL/commands/test_mean_scores.rb +1 -1
  128. data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
  129. data/test/BioDSL/commands/test_merge_table.rb +1 -1
  130. data/test/BioDSL/commands/test_merge_values.rb +1 -1
  131. data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
  132. data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
  133. data/test/BioDSL/commands/test_plot_matches.rb +1 -1
  134. data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
  135. data/test/BioDSL/commands/test_plot_scores.rb +1 -1
  136. data/test/BioDSL/commands/test_random.rb +1 -1
  137. data/test/BioDSL/commands/test_read_fasta.rb +1 -1
  138. data/test/BioDSL/commands/test_read_fastq.rb +1 -1
  139. data/test/BioDSL/commands/test_read_table.rb +1 -1
  140. data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
  141. data/test/BioDSL/commands/test_slice_align.rb +1 -1
  142. data/test/BioDSL/commands/test_slice_seq.rb +1 -1
  143. data/test/BioDSL/commands/test_sort.rb +1 -1
  144. data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
  145. data/test/BioDSL/commands/test_split_values.rb +1 -1
  146. data/test/BioDSL/commands/test_trim_primer.rb +1 -1
  147. data/test/BioDSL/commands/test_trim_seq.rb +1 -1
  148. data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
  149. data/test/BioDSL/commands/test_uclust.rb +1 -1
  150. data/test/BioDSL/commands/test_unique_values.rb +1 -1
  151. data/test/BioDSL/commands/test_usearch_global.rb +1 -1
  152. data/test/BioDSL/commands/test_usearch_local.rb +1 -1
  153. data/test/BioDSL/commands/test_write_fasta.rb +1 -1
  154. data/test/BioDSL/commands/test_write_fastq.rb +1 -1
  155. data/test/BioDSL/commands/test_write_table.rb +1 -1
  156. data/test/BioDSL/commands/test_write_tree.rb +1 -1
  157. data/test/BioDSL/helpers/test_options_helper.rb +3 -3
  158. data/test/BioDSL/seq/test_assemble.rb +58 -56
  159. data/test/BioDSL/seq/test_backtrack.rb +83 -81
  160. data/test/BioDSL/seq/test_digest.rb +47 -45
  161. data/test/BioDSL/seq/test_dynamic.rb +66 -64
  162. data/test/BioDSL/seq/test_homopolymer.rb +35 -33
  163. data/test/BioDSL/seq/test_kmer.rb +29 -28
  164. data/test/BioDSL/seq/test_translate.rb +44 -42
  165. data/test/BioDSL/seq/test_trim.rb +59 -57
  166. data/test/BioDSL/test_cary.rb +1 -1
  167. data/test/BioDSL/test_command.rb +2 -2
  168. data/test/BioDSL/test_csv.rb +34 -31
  169. data/test/BioDSL/test_debug.rb +31 -31
  170. data/test/BioDSL/test_fasta.rb +30 -29
  171. data/test/BioDSL/test_fastq.rb +27 -26
  172. data/test/BioDSL/test_filesys.rb +28 -27
  173. data/test/BioDSL/test_fork.rb +29 -28
  174. data/test/BioDSL/test_math.rb +31 -30
  175. data/test/BioDSL/test_mummer.rb +1 -1
  176. data/test/BioDSL/test_pipeline.rb +1 -1
  177. data/test/BioDSL/test_seq.rb +42 -41
  178. data/test/BioDSL/test_serializer.rb +35 -33
  179. data/test/BioDSL/test_stream.rb +28 -27
  180. data/test/BioDSL/test_taxonomy.rb +38 -37
  181. data/test/BioDSL/test_test.rb +32 -31
  182. data/test/BioDSL/test_tmp_dir.rb +1 -1
  183. data/test/BioDSL/test_usearch.rb +28 -27
  184. data/test/BioDSL/test_verbose.rb +32 -31
  185. data/test/helper.rb +34 -31
  186. metadata +3 -2
@@ -1,28 +1,29 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
27
 
27
28
  module BioDSL
28
29
  # Error class for all exceptions to do with Kmer.
@@ -36,19 +37,19 @@ module BioDSL
36
37
  oligos = []
37
38
 
38
39
  kmers.each do |kmer|
39
- oligo = ""
40
- bin = "%0#{kmer_size * 2}b" % kmer
40
+ oligo = ''
41
+ bin = format("%0#{kmer_size * 2}b", kmer)
41
42
 
42
- bin.scan(/.{2}/) { |m|
43
+ bin.scan(/.{2}/) do |m|
43
44
  case m
44
45
  when '00' then oligo << 'a'
45
46
  when '01' then oligo << 't'
46
47
  when '10' then oligo << 'c'
47
48
  when '11' then oligo << 'g'
48
49
  else
49
- raise "unknown m #{m}"
50
+ fail "unknown m #{m}"
50
51
  end
51
- }
52
+ end
52
53
 
53
54
  oligos << oligo
54
55
  end
@@ -58,33 +59,45 @@ module BioDSL
58
59
 
59
60
  # Method that returns a sorted array of unique kmers, which are integer
60
61
  # representations of DNA/RNA sequence oligos where A is encoded in two bits
61
- # as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other nucleotides
62
- # are ignored. The following options apply:
62
+ # as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other
63
+ # nucleotides are ignored. The following options apply:
63
64
  # * kmer_size: kmer size in the range 1-12.
64
65
  # * step_size: step size in the range 1-12 (default=1).
65
66
  # * score_min: drop kmers with quality score below this.
66
67
  def to_kmers(options)
67
68
  options[:step_size] ||= 1
68
69
  options[:score_min] ||= Seq::SCORE_MAX
69
- raise KmerError, "No kmer_size" unless options[:kmer_size]
70
- raise KmerError, "Bad kmer_size: #{options[:kmer_size]}" unless (1 .. 12).include? options[:kmer_size]
71
- raise KmerError, "Bad step_size: #{options[:step_size]}" unless (1 .. 12).include? options[:step_size]
72
- if self.qual and not (Seq::SCORE_MIN .. Seq::SCORE_MAX).include? options[:score_min]
73
- raise KmerError, "score minimum: #{options[:score_min]} out of range #{Seq::SCORE_MIN} .. #{Seq::SCORE_MAX}"
70
+ fail KmerError, 'No kmer_size' unless options[:kmer_size]
71
+
72
+ unless (1..12).include? options[:kmer_size]
73
+ fail KmerError, "Bad kmer_size: #{options[:kmer_size]}"
74
74
  end
75
75
 
76
- size = Seq::DNA.size ** options[:kmer_size]
76
+ unless (1..12).include? options[:step_size]
77
+ fail KmerError, "Bad step_size: #{options[:step_size]}"
78
+ end
77
79
 
78
- if defined? @kmer_ary and @kmer_ary.count == size
80
+ if @qual && !(Seq::SCORE_MIN..Seq::SCORE_MAX).
81
+ include?(options[:score_min])
82
+ fail KmerError, "score minimum: #{options[:score_min]} out of " \
83
+ "range #{Seq::SCORE_MIN}..#{Seq::SCORE_MAX}"
84
+ end
85
+
86
+ size = Seq::DNA.size**options[:kmer_size]
87
+
88
+ if defined?(@kmer_ary) && (@kmer_ary.count == size)
79
89
  @kmer_ary.zero!
80
90
  else
81
91
  @kmer_ary = BioDSL::CAry.new(size, 1)
82
92
  end
83
93
 
84
- if self.qual
85
- to_kmers_qual_C(self.seq, self.qual, @kmer_ary.ary, self.length, @kmer_ary.count, options[:kmer_size], options[:step_size], options[:score_min], Seq::SCORE_BASE)
94
+ if @qual
95
+ to_kmers_qual_C(@seq, @qual, @kmer_ary.ary, length, @kmer_ary.count,
96
+ options[:kmer_size], options[:step_size],
97
+ options[:score_min], Seq::SCORE_BASE)
86
98
  else
87
- to_kmers_C(self.seq, @kmer_ary.ary, self.length, @kmer_ary.count, options[:kmer_size], options[:step_size])
99
+ to_kmers_C(@seq, @kmer_ary.ary, length, @kmer_ary.count,
100
+ options[:kmer_size], options[:step_size])
88
101
  end
89
102
  end
90
103
 
@@ -152,7 +165,7 @@ module BioDSL
152
165
  unsigned int ary_len = FIX2UINT(_ary_len);
153
166
  unsigned int kmer_size = FIX2UINT(_kmer_size);
154
167
  unsigned int step_size = FIX2UINT(_step_size);
155
-
168
+
156
169
  VALUE array = rb_ary_new();
157
170
  unsigned int bin = 0;
158
171
  unsigned int enc = 0;
@@ -208,7 +221,7 @@ module BioDSL
208
221
  unsigned int step_size = FIX2UINT(_step_size);
209
222
  unsigned int score_min = FIX2UINT(_score_min);
210
223
  unsigned int score_base = FIX2UINT(_score_base);
211
-
224
+
212
225
  VALUE array = rb_ary_new();
213
226
  unsigned int bin = 0;
214
227
  unsigned int enc = 0;
@@ -251,11 +264,13 @@ module BioDSL
251
264
  def naive(options)
252
265
  oligos = []
253
266
 
254
- (0 .. self.length - options[:kmer_size]).each do |i|
255
- oligo = self[i ... i + options[:kmer_size]]
267
+ (0..length - options[:kmer_size]).each do |i|
268
+ oligo = self[i...i + options[:kmer_size]]
256
269
 
257
270
  next unless oligo.seq.upcase =~ /^[ATUCG]+$/
258
- next if oligo.qual and options[:scores_min] and oligo.scores_min < options[:scores_min]
271
+ next if oligo.qual &&
272
+ options[:scores_min] &&
273
+ (oligo.scores_min < options[:scores_min])
259
274
 
260
275
  oligos << oligo.seq.upcase
261
276
  end
@@ -266,11 +281,13 @@ module BioDSL
266
281
  def naive_bin(options)
267
282
  oligos = []
268
283
 
269
- (0 .. self.length - options[:kmer_size]).each do |i|
270
- oligo = self[i ... i + options[:kmer_size]]
284
+ (0..length - options[:kmer_size]).each do |i|
285
+ oligo = self[i...i + options[:kmer_size]]
271
286
 
272
287
  next unless oligo.seq.upcase =~ /^[ATCG]+$/
273
- next if oligo.qual and options[:scores_min] and oligo.scores_min < options[:scores_min]
288
+ next if oligo.qual &&
289
+ options[:scores_min] &&
290
+ (oligo.scores_min < options[:scores_min])
274
291
 
275
292
  bin = 0
276
293
 
@@ -1,28 +1,29 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of the BioDSL framework (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
27
 
27
28
  module BioDSL
28
29
  # Class to calculate the Levenshtein distance between two
@@ -34,15 +35,14 @@ module BioDSL
34
35
  BYTES_IN_INT = 4
35
36
 
36
37
  def self.distance(s, t)
37
- return 0 if s == t;
38
- return t.length if s.length == 0;
39
- return s.length if t.length == 0;
38
+ return 0 if s == t
39
+ return t.length if s.length == 0
40
+ return s.length if t.length == 0
40
41
 
41
42
  v0 = "\0" * (t.length + 1) * BYTES_IN_INT
42
43
  v1 = "\0" * (t.length + 1) * BYTES_IN_INT
43
44
 
44
- l = self.new
45
- l.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
45
+ new.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
46
46
  end
47
47
 
48
48
  # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
@@ -82,32 +82,27 @@ module BioDSL
82
82
  unsigned int i = 0;
83
83
  unsigned int j = 0;
84
84
  unsigned int cost = 0;
85
-
85
+
86
86
  for (i = 0; i < t_len + 1; i++)
87
87
  v0[i] = i;
88
-
88
+
89
89
  for (i = 0; i < s_len; i++)
90
90
  {
91
91
  v1[0] = i + 1;
92
-
92
+
93
93
  for (j = 0; j < t_len; j++)
94
94
  {
95
95
  cost = (MATCH(s[i], t[j])) ? 0 : 1;
96
96
  v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost);
97
97
  }
98
-
98
+
99
99
  for (j = 0; j < t_len + 1; j++)
100
100
  v0[j] = v1[j];
101
101
  }
102
-
102
+
103
103
  return UINT2NUM(v1[t_len]);
104
104
  }
105
105
  }
106
106
  end
107
107
  end
108
108
  end
109
-
110
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
111
-
112
-
113
- __END__
@@ -1,30 +1,33 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
-
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
27
29
  module BioDSL
30
+ # Namespace for Translate methods.
28
31
  module Translate
29
32
  # Translation table 11
30
33
  # (http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes#SG11)
@@ -34,27 +37,27 @@ module BioDSL
34
37
  # Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
35
38
  # Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
36
39
  TRANS_TAB11_START = {
37
- "TTG" => "M", "CTG" => "M", "ATT" => "M", "ATC" => "M",
38
- "ATA" => "M", "ATG" => "M", "GTG" => "M"
40
+ 'TTG' => 'M', 'CTG' => 'M', 'ATT' => 'M', 'ATC' => 'M',
41
+ 'ATA' => 'M', 'ATG' => 'M', 'GTG' => 'M'
39
42
  }
40
43
 
41
44
  TRANS_TAB11 = {
42
- "TTT" => "F", "TCT" => "S", "TAT" => "Y", "TGT" => "C",
43
- "TTC" => "F", "TCC" => "S", "TAC" => "Y", "TGC" => "C",
44
- "TTA" => "L", "TCA" => "S", "TAA" => "*", "TGA" => "*",
45
- "TTG" => "L", "TCG" => "S", "TAG" => "*", "TGG" => "W",
46
- "CTT" => "L", "CCT" => "P", "CAT" => "H", "CGT" => "R",
47
- "CTC" => "L", "CCC" => "P", "CAC" => "H", "CGC" => "R",
48
- "CTA" => "L", "CCA" => "P", "CAA" => "Q", "CGA" => "R",
49
- "CTG" => "L", "CCG" => "P", "CAG" => "Q", "CGG" => "R",
50
- "ATT" => "I", "ACT" => "T", "AAT" => "N", "AGT" => "S",
51
- "ATC" => "I", "ACC" => "T", "AAC" => "N", "AGC" => "S",
52
- "ATA" => "I", "ACA" => "T", "AAA" => "K", "AGA" => "R",
53
- "ATG" => "M", "ACG" => "T", "AAG" => "K", "AGG" => "R",
54
- "GTT" => "V", "GCT" => "A", "GAT" => "D", "GGT" => "G",
55
- "GTC" => "V", "GCC" => "A", "GAC" => "D", "GGC" => "G",
56
- "GTA" => "V", "GCA" => "A", "GAA" => "E", "GGA" => "G",
57
- "GTG" => "V", "GCG" => "A", "GAG" => "E", "GGG" => "G"
45
+ 'TTT' => 'F', 'TCT' => 'S', 'TAT' => 'Y', 'TGT' => 'C',
46
+ 'TTC' => 'F', 'TCC' => 'S', 'TAC' => 'Y', 'TGC' => 'C',
47
+ 'TTA' => 'L', 'TCA' => 'S', 'TAA' => '*', 'TGA' => '*',
48
+ 'TTG' => 'L', 'TCG' => 'S', 'TAG' => '*', 'TGG' => 'W',
49
+ 'CTT' => 'L', 'CCT' => 'P', 'CAT' => 'H', 'CGT' => 'R',
50
+ 'CTC' => 'L', 'CCC' => 'P', 'CAC' => 'H', 'CGC' => 'R',
51
+ 'CTA' => 'L', 'CCA' => 'P', 'CAA' => 'Q', 'CGA' => 'R',
52
+ 'CTG' => 'L', 'CCG' => 'P', 'CAG' => 'Q', 'CGG' => 'R',
53
+ 'ATT' => 'I', 'ACT' => 'T', 'AAT' => 'N', 'AGT' => 'S',
54
+ 'ATC' => 'I', 'ACC' => 'T', 'AAC' => 'N', 'AGC' => 'S',
55
+ 'ATA' => 'I', 'ACA' => 'T', 'AAA' => 'K', 'AGA' => 'R',
56
+ 'ATG' => 'M', 'ACG' => 'T', 'AAG' => 'K', 'AGG' => 'R',
57
+ 'GTT' => 'V', 'GCT' => 'A', 'GAT' => 'D', 'GGT' => 'G',
58
+ 'GTC' => 'V', 'GCC' => 'A', 'GAC' => 'D', 'GGC' => 'G',
59
+ 'GTA' => 'V', 'GCA' => 'A', 'GAA' => 'E', 'GGA' => 'G',
60
+ 'GTG' => 'V', 'GCG' => 'A', 'GAG' => 'E', 'GGG' => 'G'
58
61
  }
59
62
 
60
63
  # Method to translate a DNA sequence to protein.
@@ -69,41 +72,47 @@ module BioDSL
69
72
  self
70
73
  end
71
74
 
72
- alias :to_protein! :translate!
75
+ alias_method :to_protein!, :translate!
73
76
 
74
77
  def translate(trans_tab = 11)
75
- raise SeqError, "Sequence type must be 'dna' - not #{self.type}" unless self.type == :dna
76
- raise SeqError, "Sequence length must be a multiplum of 3 - was: #{self.length}" unless (self.length % 3) == 0
78
+ unless @type == :dna
79
+ fail SeqError, "Sequence type must be 'dna' - not #{@type}"
80
+ end
81
+
82
+ unless (length % 3) == 0
83
+ fail SeqError, 'Sequence length must be a multiplum of 3 - ' \
84
+ " was: #{length}"
85
+ end
77
86
 
78
87
  case trans_tab
79
88
  when 11
80
89
  codon_start_hash = TRANS_TAB11_START
81
90
  codon_hash = TRANS_TAB11
82
91
  else
83
- raise SeqError, "Unknown translation table: #{trans_tab}"
92
+ fail SeqError, "Unknown translation table: #{trans_tab}"
84
93
  end
85
94
 
86
- codon = self.seq[0 ... 3].upcase
95
+ codon = @seq[0...3].upcase
87
96
 
88
97
  aa = codon_start_hash[codon]
89
98
 
90
- raise SeqError, "Unknown start codon: #{codon}" if aa.nil?
99
+ fail SeqError, "Unknown start codon: #{codon}" if aa.nil?
91
100
 
92
101
  protein = aa.dup
93
102
 
94
- (3 ... self.length).step(3) do |i|
95
- codon = self.seq[i ... i + 3].upcase
103
+ (3...length).step(3) do |i|
104
+ codon = @seq[i...i + 3].upcase
96
105
 
97
106
  aa = codon_hash[codon]
98
107
 
99
- raise SeqError, "Unknown codon: #{codon}" if aa.nil?
108
+ fail SeqError, "Unknown codon: #{codon}" if aa.nil?
100
109
 
101
110
  protein << aa.dup
102
111
  end
103
112
 
104
- Seq.new(seq_name: self.seq_name, seq: protein[0 .. -2], type: :protein)
113
+ Seq.new(seq_name: @seq_name, seq: protein[0..-2], type: :protein)
105
114
  end
106
115
 
107
- alias :to_protein :translate
116
+ alias_method :to_protein, :translate
108
117
  end
109
118
  end