BioDSL 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/BioDSL.gemspec +1 -1
  4. data/Gemfile +6 -0
  5. data/README.md +289 -155
  6. data/Rakefile +18 -16
  7. data/lib/BioDSL.rb +1 -1
  8. data/lib/BioDSL/cary.rb +78 -53
  9. data/lib/BioDSL/command.rb +2 -2
  10. data/lib/BioDSL/commands.rb +1 -1
  11. data/lib/BioDSL/commands/add_key.rb +1 -1
  12. data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
  13. data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
  14. data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
  15. data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
  16. data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
  17. data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
  18. data/lib/BioDSL/commands/classify_seq.rb +8 -8
  19. data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
  20. data/lib/BioDSL/commands/clip_primer.rb +7 -7
  21. data/lib/BioDSL/commands/cluster_otus.rb +5 -5
  22. data/lib/BioDSL/commands/collapse_otus.rb +2 -2
  23. data/lib/BioDSL/commands/collect_otus.rb +2 -2
  24. data/lib/BioDSL/commands/complement_seq.rb +4 -4
  25. data/lib/BioDSL/commands/count.rb +1 -1
  26. data/lib/BioDSL/commands/count_values.rb +2 -2
  27. data/lib/BioDSL/commands/degap_seq.rb +6 -7
  28. data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
  29. data/lib/BioDSL/commands/dump.rb +2 -2
  30. data/lib/BioDSL/commands/filter_rrna.rb +4 -4
  31. data/lib/BioDSL/commands/genecall.rb +7 -7
  32. data/lib/BioDSL/commands/grab.rb +1 -1
  33. data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
  34. data/lib/BioDSL/commands/mask_seq.rb +4 -4
  35. data/lib/BioDSL/commands/mean_scores.rb +2 -2
  36. data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
  37. data/lib/BioDSL/commands/merge_table.rb +1 -1
  38. data/lib/BioDSL/commands/merge_values.rb +1 -1
  39. data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
  40. data/lib/BioDSL/commands/plot_histogram.rb +4 -4
  41. data/lib/BioDSL/commands/plot_matches.rb +5 -5
  42. data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
  43. data/lib/BioDSL/commands/plot_scores.rb +7 -7
  44. data/lib/BioDSL/commands/random.rb +1 -1
  45. data/lib/BioDSL/commands/read_fasta.rb +9 -9
  46. data/lib/BioDSL/commands/read_fastq.rb +16 -16
  47. data/lib/BioDSL/commands/read_table.rb +2 -3
  48. data/lib/BioDSL/commands/reverse_seq.rb +4 -4
  49. data/lib/BioDSL/commands/slice_align.rb +4 -4
  50. data/lib/BioDSL/commands/slice_seq.rb +3 -3
  51. data/lib/BioDSL/commands/sort.rb +1 -1
  52. data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
  53. data/lib/BioDSL/commands/split_values.rb +2 -2
  54. data/lib/BioDSL/commands/trim_primer.rb +13 -8
  55. data/lib/BioDSL/commands/trim_seq.rb +5 -5
  56. data/lib/BioDSL/commands/uchime_ref.rb +6 -6
  57. data/lib/BioDSL/commands/uclust.rb +5 -5
  58. data/lib/BioDSL/commands/unique_values.rb +1 -1
  59. data/lib/BioDSL/commands/usearch_global.rb +2 -2
  60. data/lib/BioDSL/commands/usearch_local.rb +2 -2
  61. data/lib/BioDSL/commands/write_fasta.rb +7 -9
  62. data/lib/BioDSL/commands/write_fastq.rb +4 -4
  63. data/lib/BioDSL/commands/write_table.rb +3 -3
  64. data/lib/BioDSL/commands/write_tree.rb +2 -3
  65. data/lib/BioDSL/config.rb +2 -2
  66. data/lib/BioDSL/csv.rb +8 -10
  67. data/lib/BioDSL/debug.rb +1 -1
  68. data/lib/BioDSL/fasta.rb +54 -40
  69. data/lib/BioDSL/fastq.rb +35 -32
  70. data/lib/BioDSL/filesys.rb +56 -47
  71. data/lib/BioDSL/fork.rb +1 -1
  72. data/lib/BioDSL/hamming.rb +1 -1
  73. data/lib/BioDSL/helpers.rb +1 -1
  74. data/lib/BioDSL/helpers/aux_helper.rb +1 -1
  75. data/lib/BioDSL/helpers/email_helper.rb +1 -1
  76. data/lib/BioDSL/helpers/history_helper.rb +1 -1
  77. data/lib/BioDSL/helpers/log_helper.rb +1 -1
  78. data/lib/BioDSL/helpers/options_helper.rb +1 -1
  79. data/lib/BioDSL/helpers/status_helper.rb +1 -1
  80. data/lib/BioDSL/html_report.rb +1 -1
  81. data/lib/BioDSL/math.rb +1 -1
  82. data/lib/BioDSL/mummer.rb +1 -1
  83. data/lib/BioDSL/pipeline.rb +1 -1
  84. data/lib/BioDSL/seq.rb +240 -231
  85. data/lib/BioDSL/seq/ambiguity.rb +1 -1
  86. data/lib/BioDSL/seq/assemble.rb +1 -1
  87. data/lib/BioDSL/seq/backtrack.rb +93 -76
  88. data/lib/BioDSL/seq/digest.rb +1 -1
  89. data/lib/BioDSL/seq/dynamic.rb +43 -55
  90. data/lib/BioDSL/seq/homopolymer.rb +34 -36
  91. data/lib/BioDSL/seq/kmer.rb +67 -50
  92. data/lib/BioDSL/seq/levenshtein.rb +35 -40
  93. data/lib/BioDSL/seq/translate.rb +64 -55
  94. data/lib/BioDSL/seq/trim.rb +60 -50
  95. data/lib/BioDSL/serializer.rb +1 -1
  96. data/lib/BioDSL/stream.rb +1 -1
  97. data/lib/BioDSL/taxonomy.rb +1 -1
  98. data/lib/BioDSL/test.rb +1 -1
  99. data/lib/BioDSL/tmp_dir.rb +1 -1
  100. data/lib/BioDSL/usearch.rb +1 -1
  101. data/lib/BioDSL/verbose.rb +1 -1
  102. data/lib/BioDSL/version.rb +2 -2
  103. data/test/BioDSL/commands/test_add_key.rb +1 -1
  104. data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
  105. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
  106. data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
  107. data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
  108. data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
  109. data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
  110. data/test/BioDSL/commands/test_classify_seq.rb +1 -1
  111. data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
  112. data/test/BioDSL/commands/test_clip_primer.rb +1 -1
  113. data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
  114. data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
  115. data/test/BioDSL/commands/test_collect_otus.rb +1 -1
  116. data/test/BioDSL/commands/test_complement_seq.rb +1 -1
  117. data/test/BioDSL/commands/test_count.rb +1 -1
  118. data/test/BioDSL/commands/test_count_values.rb +1 -1
  119. data/test/BioDSL/commands/test_degap_seq.rb +1 -1
  120. data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
  121. data/test/BioDSL/commands/test_dump.rb +1 -1
  122. data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
  123. data/test/BioDSL/commands/test_genecall.rb +1 -1
  124. data/test/BioDSL/commands/test_grab.rb +1 -1
  125. data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
  126. data/test/BioDSL/commands/test_mask_seq.rb +1 -1
  127. data/test/BioDSL/commands/test_mean_scores.rb +1 -1
  128. data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
  129. data/test/BioDSL/commands/test_merge_table.rb +1 -1
  130. data/test/BioDSL/commands/test_merge_values.rb +1 -1
  131. data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
  132. data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
  133. data/test/BioDSL/commands/test_plot_matches.rb +1 -1
  134. data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
  135. data/test/BioDSL/commands/test_plot_scores.rb +1 -1
  136. data/test/BioDSL/commands/test_random.rb +1 -1
  137. data/test/BioDSL/commands/test_read_fasta.rb +1 -1
  138. data/test/BioDSL/commands/test_read_fastq.rb +1 -1
  139. data/test/BioDSL/commands/test_read_table.rb +1 -1
  140. data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
  141. data/test/BioDSL/commands/test_slice_align.rb +1 -1
  142. data/test/BioDSL/commands/test_slice_seq.rb +1 -1
  143. data/test/BioDSL/commands/test_sort.rb +1 -1
  144. data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
  145. data/test/BioDSL/commands/test_split_values.rb +1 -1
  146. data/test/BioDSL/commands/test_trim_primer.rb +1 -1
  147. data/test/BioDSL/commands/test_trim_seq.rb +1 -1
  148. data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
  149. data/test/BioDSL/commands/test_uclust.rb +1 -1
  150. data/test/BioDSL/commands/test_unique_values.rb +1 -1
  151. data/test/BioDSL/commands/test_usearch_global.rb +1 -1
  152. data/test/BioDSL/commands/test_usearch_local.rb +1 -1
  153. data/test/BioDSL/commands/test_write_fasta.rb +1 -1
  154. data/test/BioDSL/commands/test_write_fastq.rb +1 -1
  155. data/test/BioDSL/commands/test_write_table.rb +1 -1
  156. data/test/BioDSL/commands/test_write_tree.rb +1 -1
  157. data/test/BioDSL/helpers/test_options_helper.rb +3 -3
  158. data/test/BioDSL/seq/test_assemble.rb +58 -56
  159. data/test/BioDSL/seq/test_backtrack.rb +83 -81
  160. data/test/BioDSL/seq/test_digest.rb +47 -45
  161. data/test/BioDSL/seq/test_dynamic.rb +66 -64
  162. data/test/BioDSL/seq/test_homopolymer.rb +35 -33
  163. data/test/BioDSL/seq/test_kmer.rb +29 -28
  164. data/test/BioDSL/seq/test_translate.rb +44 -42
  165. data/test/BioDSL/seq/test_trim.rb +59 -57
  166. data/test/BioDSL/test_cary.rb +1 -1
  167. data/test/BioDSL/test_command.rb +2 -2
  168. data/test/BioDSL/test_csv.rb +34 -31
  169. data/test/BioDSL/test_debug.rb +31 -31
  170. data/test/BioDSL/test_fasta.rb +30 -29
  171. data/test/BioDSL/test_fastq.rb +27 -26
  172. data/test/BioDSL/test_filesys.rb +28 -27
  173. data/test/BioDSL/test_fork.rb +29 -28
  174. data/test/BioDSL/test_math.rb +31 -30
  175. data/test/BioDSL/test_mummer.rb +1 -1
  176. data/test/BioDSL/test_pipeline.rb +1 -1
  177. data/test/BioDSL/test_seq.rb +42 -41
  178. data/test/BioDSL/test_serializer.rb +35 -33
  179. data/test/BioDSL/test_stream.rb +28 -27
  180. data/test/BioDSL/test_taxonomy.rb +38 -37
  181. data/test/BioDSL/test_test.rb +32 -31
  182. data/test/BioDSL/test_tmp_dir.rb +1 -1
  183. data/test/BioDSL/test_usearch.rb +28 -27
  184. data/test/BioDSL/test_verbose.rb +32 -31
  185. data/test/helper.rb +34 -31
  186. metadata +3 -2
@@ -1,28 +1,29 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
27
 
27
28
  module BioDSL
28
29
  # Error class for all exceptions to do with Kmer.
@@ -36,19 +37,19 @@ module BioDSL
36
37
  oligos = []
37
38
 
38
39
  kmers.each do |kmer|
39
- oligo = ""
40
- bin = "%0#{kmer_size * 2}b" % kmer
40
+ oligo = ''
41
+ bin = format("%0#{kmer_size * 2}b", kmer)
41
42
 
42
- bin.scan(/.{2}/) { |m|
43
+ bin.scan(/.{2}/) do |m|
43
44
  case m
44
45
  when '00' then oligo << 'a'
45
46
  when '01' then oligo << 't'
46
47
  when '10' then oligo << 'c'
47
48
  when '11' then oligo << 'g'
48
49
  else
49
- raise "unknown m #{m}"
50
+ fail "unknown m #{m}"
50
51
  end
51
- }
52
+ end
52
53
 
53
54
  oligos << oligo
54
55
  end
@@ -58,33 +59,45 @@ module BioDSL
58
59
 
59
60
  # Method that returns a sorted array of unique kmers, which are integer
60
61
  # representations of DNA/RNA sequence oligos where A is encoded in two bits
61
- # as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other nucleotides
62
- # are ignored. The following options apply:
62
+ # as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other
63
+ # nucleotides are ignored. The following options apply:
63
64
  # * kmer_size: kmer size in the range 1-12.
64
65
  # * step_size: step size in the range 1-12 (defualt=1).
65
66
  # * score_min: drop kmers with quality score below this.
66
67
  def to_kmers(options)
67
68
  options[:step_size] ||= 1
68
69
  options[:score_min] ||= Seq::SCORE_MAX
69
- raise KmerError, "No kmer_size" unless options[:kmer_size]
70
- raise KmerError, "Bad kmer_size: #{options[:kmer_size]}" unless (1 .. 12).include? options[:kmer_size]
71
- raise KmerError, "Bad step_size: #{options[:step_size]}" unless (1 .. 12).include? options[:step_size]
72
- if self.qual and not (Seq::SCORE_MIN .. Seq::SCORE_MAX).include? options[:score_min]
73
- raise KmerError, "score minimum: #{options[:score_min]} out of range #{Seq::SCORE_MIN} .. #{Seq::SCORE_MAX}"
70
+ fail KmerError, 'No kmer_size' unless options[:kmer_size]
71
+
72
+ unless (1..12).include? options[:kmer_size]
73
+ fail KmerError, "Bad kmer_size: #{options[:kmer_size]}"
74
74
  end
75
75
 
76
- size = Seq::DNA.size ** options[:kmer_size]
76
+ unless (1..12).include? options[:step_size]
77
+ fail KmerError, "Bad step_size: #{options[:step_size]}"
78
+ end
77
79
 
78
- if defined? @kmer_ary and @kmer_ary.count == size
80
+ if @qual && !(Seq::SCORE_MIN..Seq::SCORE_MAX).
81
+ include?(options[:score_min])
82
+ fail KmerError, "score minimum: #{options[:score_min]} out of " \
83
+ "range #{Seq::SCORE_MIN}..#{Seq::SCORE_MAX}"
84
+ end
85
+
86
+ size = Seq::DNA.size**options[:kmer_size]
87
+
88
+ if defined?(@kmer_ary) && (@kmer_ary.count == size)
79
89
  @kmer_ary.zero!
80
90
  else
81
91
  @kmer_ary = BioDSL::CAry.new(size, 1)
82
92
  end
83
93
 
84
- if self.qual
85
- to_kmers_qual_C(self.seq, self.qual, @kmer_ary.ary, self.length, @kmer_ary.count, options[:kmer_size], options[:step_size], options[:score_min], Seq::SCORE_BASE)
94
+ if @qual
95
+ to_kmers_qual_C(@seq, @qual, @kmer_ary.ary, length, @kmer_ary.count,
96
+ options[:kmer_size], options[:step_size],
97
+ options[:score_min], Seq::SCORE_BASE)
86
98
  else
87
- to_kmers_C(self.seq, @kmer_ary.ary, self.length, @kmer_ary.count, options[:kmer_size], options[:step_size])
99
+ to_kmers_C(@seq, @kmer_ary.ary, length, @kmer_ary.count,
100
+ options[:kmer_size], options[:step_size])
88
101
  end
89
102
  end
90
103
 
@@ -152,7 +165,7 @@ module BioDSL
152
165
  unsigned int ary_len = FIX2UINT(_ary_len);
153
166
  unsigned int kmer_size = FIX2UINT(_kmer_size);
154
167
  unsigned int step_size = FIX2UINT(_step_size);
155
-
168
+
156
169
  VALUE array = rb_ary_new();
157
170
  unsigned int bin = 0;
158
171
  unsigned int enc = 0;
@@ -208,7 +221,7 @@ module BioDSL
208
221
  unsigned int step_size = FIX2UINT(_step_size);
209
222
  unsigned int score_min = FIX2UINT(_score_min);
210
223
  unsigned int score_base = FIX2UINT(_score_base);
211
-
224
+
212
225
  VALUE array = rb_ary_new();
213
226
  unsigned int bin = 0;
214
227
  unsigned int enc = 0;
@@ -251,11 +264,13 @@ module BioDSL
251
264
  def naive(options)
252
265
  oligos = []
253
266
 
254
- (0 .. self.length - options[:kmer_size]).each do |i|
255
- oligo = self[i ... i + options[:kmer_size]]
267
+ (0..length - options[:kmer_size]).each do |i|
268
+ oligo = self[i...i + options[:kmer_size]]
256
269
 
257
270
  next unless oligo.seq.upcase =~ /^[ATUCG]+$/
258
- next if oligo.qual and options[:scores_min] and oligo.scores_min < options[:scores_min]
271
+ next if oligo.qual &&
272
+ options[:scores_min] &&
273
+ (oligo.scores_min < options[:scores_min])
259
274
 
260
275
  oligos << oligo.seq.upcase
261
276
  end
@@ -266,11 +281,13 @@ module BioDSL
266
281
  def naive_bin(options)
267
282
  oligos = []
268
283
 
269
- (0 .. self.length - options[:kmer_size]).each do |i|
270
- oligo = self[i ... i + options[:kmer_size]]
284
+ (0..length - options[:kmer_size]).each do |i|
285
+ oligo = self[i...i + options[:kmer_size]]
271
286
 
272
287
  next unless oligo.seq.upcase =~ /^[ATCG]+$/
273
- next if oligo.qual and options[:scores_min] and oligo.scores_min < options[:scores_min]
288
+ next if oligo.qual &&
289
+ options[:scores_min] &&
290
+ (oligo.scores_min < options[:scores_min])
274
291
 
275
292
  bin = 0
276
293
 
@@ -1,28 +1,29 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of the BioDSL framework (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
27
 
27
28
  module BioDSL
28
29
  # Class to calculate the Levenshtein distance between two
@@ -34,15 +35,14 @@ module BioDSL
34
35
  BYTES_IN_INT = 4
35
36
 
36
37
  def self.distance(s, t)
37
- return 0 if s == t;
38
- return t.length if s.length == 0;
39
- return s.length if t.length == 0;
38
+ return 0 if s == t
39
+ return t.length if s.length == 0
40
+ return s.length if t.length == 0
40
41
 
41
42
  v0 = "\0" * (t.length + 1) * BYTES_IN_INT
42
43
  v1 = "\0" * (t.length + 1) * BYTES_IN_INT
43
44
 
44
- l = self.new
45
- l.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
45
+ new.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
46
46
  end
47
47
 
48
48
  # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
@@ -82,32 +82,27 @@ module BioDSL
82
82
  unsigned int i = 0;
83
83
  unsigned int j = 0;
84
84
  unsigned int cost = 0;
85
-
85
+
86
86
  for (i = 0; i < t_len + 1; i++)
87
87
  v0[i] = i;
88
-
88
+
89
89
  for (i = 0; i < s_len; i++)
90
90
  {
91
91
  v1[0] = i + 1;
92
-
92
+
93
93
  for (j = 0; j < t_len; j++)
94
94
  {
95
95
  cost = (MATCH(s[i], t[j])) ? 0 : 1;
96
96
  v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost);
97
97
  }
98
-
98
+
99
99
  for (j = 0; j < t_len + 1; j++)
100
100
  v0[j] = v1[j];
101
101
  }
102
-
102
+
103
103
  return UINT2NUM(v1[t_len]);
104
104
  }
105
105
  }
106
106
  end
107
107
  end
108
108
  end
109
-
110
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
111
-
112
-
113
- __END__
@@ -1,30 +1,33 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
-
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
27
29
  module BioDSL
30
+ # Namespace for Translate methods.
28
31
  module Translate
29
32
  # Translation table 11
30
33
  # (http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes#SG11)
@@ -34,27 +37,27 @@ module BioDSL
34
37
  # Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
35
38
  # Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
36
39
  TRANS_TAB11_START = {
37
- "TTG" => "M", "CTG" => "M", "ATT" => "M", "ATC" => "M",
38
- "ATA" => "M", "ATG" => "M", "GTG" => "M"
40
+ 'TTG' => 'M', 'CTG' => 'M', 'ATT' => 'M', 'ATC' => 'M',
41
+ 'ATA' => 'M', 'ATG' => 'M', 'GTG' => 'M'
39
42
  }
40
43
 
41
44
  TRANS_TAB11 = {
42
- "TTT" => "F", "TCT" => "S", "TAT" => "Y", "TGT" => "C",
43
- "TTC" => "F", "TCC" => "S", "TAC" => "Y", "TGC" => "C",
44
- "TTA" => "L", "TCA" => "S", "TAA" => "*", "TGA" => "*",
45
- "TTG" => "L", "TCG" => "S", "TAG" => "*", "TGG" => "W",
46
- "CTT" => "L", "CCT" => "P", "CAT" => "H", "CGT" => "R",
47
- "CTC" => "L", "CCC" => "P", "CAC" => "H", "CGC" => "R",
48
- "CTA" => "L", "CCA" => "P", "CAA" => "Q", "CGA" => "R",
49
- "CTG" => "L", "CCG" => "P", "CAG" => "Q", "CGG" => "R",
50
- "ATT" => "I", "ACT" => "T", "AAT" => "N", "AGT" => "S",
51
- "ATC" => "I", "ACC" => "T", "AAC" => "N", "AGC" => "S",
52
- "ATA" => "I", "ACA" => "T", "AAA" => "K", "AGA" => "R",
53
- "ATG" => "M", "ACG" => "T", "AAG" => "K", "AGG" => "R",
54
- "GTT" => "V", "GCT" => "A", "GAT" => "D", "GGT" => "G",
55
- "GTC" => "V", "GCC" => "A", "GAC" => "D", "GGC" => "G",
56
- "GTA" => "V", "GCA" => "A", "GAA" => "E", "GGA" => "G",
57
- "GTG" => "V", "GCG" => "A", "GAG" => "E", "GGG" => "G"
45
+ 'TTT' => 'F', 'TCT' => 'S', 'TAT' => 'Y', 'TGT' => 'C',
46
+ 'TTC' => 'F', 'TCC' => 'S', 'TAC' => 'Y', 'TGC' => 'C',
47
+ 'TTA' => 'L', 'TCA' => 'S', 'TAA' => '*', 'TGA' => '*',
48
+ 'TTG' => 'L', 'TCG' => 'S', 'TAG' => '*', 'TGG' => 'W',
49
+ 'CTT' => 'L', 'CCT' => 'P', 'CAT' => 'H', 'CGT' => 'R',
50
+ 'CTC' => 'L', 'CCC' => 'P', 'CAC' => 'H', 'CGC' => 'R',
51
+ 'CTA' => 'L', 'CCA' => 'P', 'CAA' => 'Q', 'CGA' => 'R',
52
+ 'CTG' => 'L', 'CCG' => 'P', 'CAG' => 'Q', 'CGG' => 'R',
53
+ 'ATT' => 'I', 'ACT' => 'T', 'AAT' => 'N', 'AGT' => 'S',
54
+ 'ATC' => 'I', 'ACC' => 'T', 'AAC' => 'N', 'AGC' => 'S',
55
+ 'ATA' => 'I', 'ACA' => 'T', 'AAA' => 'K', 'AGA' => 'R',
56
+ 'ATG' => 'M', 'ACG' => 'T', 'AAG' => 'K', 'AGG' => 'R',
57
+ 'GTT' => 'V', 'GCT' => 'A', 'GAT' => 'D', 'GGT' => 'G',
58
+ 'GTC' => 'V', 'GCC' => 'A', 'GAC' => 'D', 'GGC' => 'G',
59
+ 'GTA' => 'V', 'GCA' => 'A', 'GAA' => 'E', 'GGA' => 'G',
60
+ 'GTG' => 'V', 'GCG' => 'A', 'GAG' => 'E', 'GGG' => 'G'
58
61
  }
59
62
 
60
63
  # Method to translate a DNA sequence to protein.
@@ -69,41 +72,47 @@ module BioDSL
69
72
  self
70
73
  end
71
74
 
72
- alias :to_protein! :translate!
75
+ alias_method :to_protein!, :translate!
73
76
 
74
77
  def translate(trans_tab = 11)
75
- raise SeqError, "Sequence type must be 'dna' - not #{self.type}" unless self.type == :dna
76
- raise SeqError, "Sequence length must be a multiplum of 3 - was: #{self.length}" unless (self.length % 3) == 0
78
+ unless @type == :dna
79
+ fail SeqError, "Sequence type must be 'dna' - not #{@type}"
80
+ end
81
+
82
+ unless (length % 3) == 0
83
+ fail SeqError, 'Sequence length must be a multiplum of 3 - ' \
84
+ " was: #{length}"
85
+ end
77
86
 
78
87
  case trans_tab
79
88
  when 11
80
89
  codon_start_hash = TRANS_TAB11_START
81
90
  codon_hash = TRANS_TAB11
82
91
  else
83
- raise SeqError, "Unknown translation table: #{trans_tab}"
92
+ fail SeqError, "Unknown translation table: #{trans_tab}"
84
93
  end
85
94
 
86
- codon = self.seq[0 ... 3].upcase
95
+ codon = @seq[0...3].upcase
87
96
 
88
97
  aa = codon_start_hash[codon]
89
98
 
90
- raise SeqError, "Unknown start codon: #{codon}" if aa.nil?
99
+ fail SeqError, "Unknown start codon: #{codon}" if aa.nil?
91
100
 
92
101
  protein = aa.dup
93
102
 
94
- (3 ... self.length).step(3) do |i|
95
- codon = self.seq[i ... i + 3].upcase
103
+ (3...length).step(3) do |i|
104
+ codon = @seq[i...i + 3].upcase
96
105
 
97
106
  aa = codon_hash[codon]
98
107
 
99
- raise SeqError, "Unknown codon: #{codon}" if aa.nil?
108
+ fail SeqError, "Unknown codon: #{codon}" if aa.nil?
100
109
 
101
110
  protein << aa.dup
102
111
  end
103
112
 
104
- Seq.new(seq_name: self.seq_name, seq: protein[0 .. -2], type: :protein)
113
+ Seq.new(seq_name: @seq_name, seq: protein[0..-2], type: :protein)
105
114
  end
106
115
 
107
- alias :to_protein :translate
116
+ alias_method :to_protein, :translate
108
117
  end
109
118
  end