BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,293 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
+ module BioDSL
28
# Raised for every failure that originates in the Kmer module.
class KmerError < StandardError
end
30
+
31
+ # Module containing methods for manipulating sequence kmers.
32
+ module Kmer
33
# Debug helper that decodes binary encoded kmers back into nucleotide oligos.
#
# Each kmer is formatted as a zero-padded binary string of kmer_size * 2
# digits and then read two digits at a time:
# 00 => a, 01 => t, 10 => c, 11 => g.
#
# kmers     - Collection of Integer encoded kmers (anything responding to
#             #each).
# kmer_size - Integer number of nucleotides per kmer.
#
# Returns an Array with one lowercase oligo String per kmer.
# Raises RuntimeError if a two-character chunk is not a pair of bits.
def self.to_oligos(kmers, kmer_size)
  nucleotides = { '00' => 'a', '01' => 't', '10' => 'c', '11' => 'g' }
  width       = kmer_size * 2
  oligos      = []

  kmers.each do |kmer|
    pairs = ("%0#{width}b" % kmer).scan(/.{2}/)

    oligos << pairs.inject("") do |oligo, pair|
      oligo << nucleotides.fetch(pair) { raise "unknown m #{pair}" }
    end
  end

  oligos
end
58
+
59
# Method that returns a sorted array of unique kmers, which are integer
# representations of DNA/RNA sequence oligos where A is encoded in two bits
# as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other nucleotides
# are ignored.
#
# options - Hash with the following keys (the Hash itself is not modified):
#           :kmer_size - kmer size in the range 1-12 (required).
#           :step_size - step size in the range 1-12 (default=1).
#           :score_min - drop kmers with quality score below this
#                        (default=Seq::SCORE_MAX). Only validated and used
#                        when the entry has quality scores.
#
# NOTE(review): because :score_min defaults to Seq::SCORE_MAX, an entry with
# quality scores appears to keep only kmers whose bases all reach the maximum
# score unless :score_min is given explicitly - confirm this is intended.
#
# Returns an Array of Integer kmers.
# Raises KmerError if :kmer_size is missing, or if :kmer_size, :step_size or
# :score_min is out of range.
def to_kmers(options)
  kmer_size = options[:kmer_size]
  step_size = options[:step_size] || 1
  score_min = options[:score_min] || Seq::SCORE_MAX

  raise KmerError, "No kmer_size" unless kmer_size
  raise KmerError, "Bad kmer_size: #{kmer_size}" unless (1..12).include?(kmer_size)
  raise KmerError, "Bad step_size: #{step_size}" unless (1..12).include?(step_size)

  if self.qual && !(Seq::SCORE_MIN..Seq::SCORE_MAX).include?(score_min)
    raise KmerError, "score minimum: #{score_min} out of range #{Seq::SCORE_MIN} .. #{Seq::SCORE_MAX}"
  end

  # One byte slot per possible kmer of this size.
  size = Seq::DNA.size**kmer_size

  # Reuse the scratch buffer from a previous call when its size still fits;
  # otherwise allocate a fresh one.
  if defined?(@kmer_ary) && @kmer_ary.count == size
    @kmer_ary.zero!
  else
    @kmer_ary = BioDSL::CAry.new(size, 1)
  end

  if self.qual
    to_kmers_qual_C(self.seq, self.qual, @kmer_ary.ary, self.length,
                    @kmer_ary.count, kmer_size, step_size, score_min,
                    Seq::SCORE_BASE)
  else
    to_kmers_C(self.seq, @kmer_ary.ary, self.length, @kmer_ary.count,
               kmer_size, step_size)
  end
end
90
+
91
+ private
92
+
93
# RubyInline C implementations used by #to_kmers.
#
# Both C functions slide over the sequence one base at a time, keep a running
# 2-bit-per-base encoding in +bin+, flag every accepted kmer in the byte array
# +ary+ (index == kmer value) and finally return the flagged indexes in
# ascending order as a Ruby Array.
#
# NOTE(review): step_size is tested against the index of the last base of the
# kmer (i % step_size), not the index of its first base - confirm that this
# is the intended meaning of "step".
inline do |builder|
  # encode_nuc shifts *bin two bits to the left and ORs in the code of nuc
  # (A=0, T/U=1, C=2, G=3; case insensitive). It returns 1 for a recognised
  # nucleotide and 0 otherwise; *bin is shifted in both cases.
  builder.prefix %{
    int encode_nuc(char nuc, unsigned int *bin)
    {
      *bin <<= 2;

      switch (nuc)
      {
        case 'a':
        case 'A':
          break;
        case 't':
        case 'T':
        case 'u':
        case 'U':
          *bin |= 1;
          break;
        case 'c':
        case 'C':
          *bin |= 2;
          break;
        case 'g':
        case 'G':
          *bin |= 3;
          break;
        default:
          return 0;
      }

      return 1;
    }
  }

  # Kmers from a sequence without quality scores. +enc+ counts the run of
  # consecutive recognised nucleotides and is reset by any other residue, so
  # a kmer is only flagged once it consists of kmer_size valid bases.
  builder.c %{
    VALUE to_kmers_C(
      VALUE _seq,        // DNA or RNA sequence string.
      VALUE _ary,        // Byte array for sort and uniq.
      VALUE _seq_len,    // Sequence length.
      VALUE _ary_len,    // Byte array length.
      VALUE _kmer_size,  // Size of kmer or oligo.
      VALUE _step_size   // Step size for overlapping kmers.
    )
    {
      char         *seq       = StringValuePtr(_seq);
      char         *ary       = StringValuePtr(_ary);
      unsigned int  seq_len   = FIX2UINT(_seq_len);
      unsigned int  ary_len   = FIX2UINT(_ary_len);
      unsigned int  kmer_size = FIX2UINT(_kmer_size);
      unsigned int  step_size = FIX2UINT(_step_size);

      VALUE         array = rb_ary_new();
      unsigned int  bin   = 0;
      unsigned int  enc   = 0;
      unsigned int  i     = 0;
      unsigned int  mask  = (1 << (2 * kmer_size)) - 1;

      for (i = 0; i < seq_len; i++)
      {
        if (encode_nuc(seq[i], &bin))
        {
          enc++;

          if (((i % step_size) == 0) && (enc >= kmer_size)) {
            ary[(bin & mask)] = 1;
          }
        }
        else
        {
          enc = 0;
        }
      }

      for (i = 0; i < ary_len; i++)
      {
        if (ary[i]) {
          rb_ary_push(array, INT2FIX(i));
        }
      }

      return array;
    }
  }

  # Kmers from a sequence with quality scores. Works like to_kmers_C, but a
  # base whose score (quality byte minus score_base) is below score_min also
  # resets the run. The comparison is written as
  # qual < score_base + score_min so that a quality byte below score_base
  # cannot wrap around in unsigned arithmetic and pass as a high score.
  builder.c %{
    VALUE to_kmers_qual_C(
      VALUE _seq,        // DNA or RNA sequence string.
      VALUE _qual,       // Quality score string.
      VALUE _ary,        // Byte array for sort and uniq.
      VALUE _seq_len,    // Sequence length.
      VALUE _ary_len,    // Byte array length.
      VALUE _kmer_size,  // Size of kmer or oligo.
      VALUE _step_size,  // Step size for overlapping kmers.
      VALUE _score_min,  // Minimum quality score to accept in a kmer.
      VALUE _score_base  // Quality score base.
    )
    {
      char         *seq        = StringValuePtr(_seq);
      char         *qual       = StringValuePtr(_qual);
      char         *ary        = StringValuePtr(_ary);
      unsigned int  seq_len    = FIX2UINT(_seq_len);
      unsigned int  ary_len    = FIX2UINT(_ary_len);
      unsigned int  kmer_size  = FIX2UINT(_kmer_size);
      unsigned int  step_size  = FIX2UINT(_step_size);
      unsigned int  score_min  = FIX2UINT(_score_min);
      unsigned int  score_base = FIX2UINT(_score_base);

      VALUE         array = rb_ary_new();
      unsigned int  bin   = 0;
      unsigned int  enc   = 0;
      unsigned int  i     = 0;
      unsigned int  mask  = (1 << (2 * kmer_size)) - 1;

      for (i = 0; i < seq_len; i++)
      {
        if (encode_nuc(seq[i], &bin))
        {
          enc++;

          if ((unsigned int) qual[i] < score_base + score_min)
          {
            enc = 0;
          }
          else if ((enc >= kmer_size) && ((i % step_size) == 0))
          {
            ary[(bin & mask)] = 1;
          }
        }
        else
        {
          enc = 0;
        }
      }

      for (i = 0; i < ary_len; i++)
      {
        if (ary[i]) {
          rb_ary_push(array, INT2FIX(i));
        }
      }

      return array;
    }
  }
end
250
+
251
# Pure Ruby kmer extraction that returns the oligos as uppercase Strings.
#
# Every window of options[:kmer_size] residues is visited in order. A window
# is skipped when it contains anything but A, T, U, C or G, or when the entry
# has quality scores, options[:scores_min] is given and the lowest score in
# the window is below it.
#
# options - Hash with :kmer_size (required) and :scores_min (optional).
#
# Returns an Array of Strings (duplicates are kept).
def naive(options)
  kmer_size = options[:kmer_size]
  score_min = options[:scores_min]

  (0..self.length - kmer_size).each_with_object([]) do |start, oligos|
    oligo = self[start...start + kmer_size]
    bases = oligo.seq.upcase

    next unless bases =~ /^[ATUCG]+$/
    next if oligo.qual && score_min && oligo.scores_min < score_min

    oligos << bases
  end
end
265
+
266
# Pure Ruby kmer extraction that returns the oligos as binary encoded
# Integers, two bits per residue: A => 00, T/U => 01, C => 10, G => 11.
#
# Every window of options[:kmer_size] residues is visited in order. A window
# is skipped when it contains anything but A, T, U, C or G (case
# insensitive), or when the entry has quality scores, options[:scores_min]
# is given and the lowest score in the window is below it. U is accepted so
# that RNA is handled the same way as in #naive and #to_kmers.
#
# options - Hash with :kmer_size (required) and :scores_min (optional).
#
# Returns an Array of Integers (duplicates are kept, window order).
def naive_bin(options)
  codes     = { 'A' => 0, 'T' => 1, 'U' => 1, 'C' => 2, 'G' => 3 }
  kmer_size = options[:kmer_size]
  score_min = options[:scores_min]

  (0..self.length - kmer_size).each_with_object([]) do |start, oligos|
    oligo = self[start...start + kmer_size]
    bases = oligo.seq.upcase

    # Anchor on the whole string so every character has an entry in codes.
    next unless bases =~ /\A[ATUCG]+\z/
    next if oligo.qual && score_min && oligo.scores_min < score_min

    oligos << bases.each_char.inject(0) { |bin, base| (bin << 2) | codes[base] }
  end
end
292
+ end
293
+ end
@@ -0,0 +1,113 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
+ module BioDSL
28
+ # Class to calculate the Levenshtein distance between two
29
+ # given strings.
30
+ # http://en.wikipedia.org/wiki/Levenshtein_distance
31
+ class Levenshtein
32
+ extend BioDSL::Ambiguity
33
+
34
+ BYTES_IN_INT = 4
35
+
36
# Compute the Levenshtein distance between the strings s and t.
#
# Trivial cases (identical strings, or one string being empty) are answered
# directly. Otherwise two zero-filled score vectors with room for
# t.length + 1 integers of BYTES_IN_INT bytes each are allocated and the
# work is handed to the inline C routine levenshtein_distance_C.
#
# s - String.
# t - String.
#
# Returns the distance as an Integer.
def self.distance(s, t)
  return 0 if s == t
  return t.length if s.length.zero?
  return s.length if t.length.zero?

  vector_bytes = (t.length + 1) * BYTES_IN_INT
  v0 = "\0" * vector_bytes
  v1 = "\0" * vector_bytes

  new.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
end
47
+
48
+ # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
49
+
50
# RubyInline C implementation of the two-row Levenshtein algorithm.
#
# add_ambiguity_macro(builder) is expected to supply the MATCH macro used
# below to compare two residues (it comes from the extended Ambiguity
# module, which is not shown here).
inline do |builder|
  add_ambiguity_macro(builder)

  # Smallest of three unsigned integers.
  builder.prefix %{
    unsigned int min(unsigned int a, unsigned int b, unsigned int c)
    {
      unsigned int m = a;

      if (m > b) m = b;
      if (m > c) m = c;

      return m;
    }
  }

  # _v0 and _v1 are Ruby Strings used as raw unsigned int buffers; each must
  # hold at least t_len + 1 integers. After every row of the matrix the two
  # row pointers are swapped instead of copying the row, so the most recently
  # completed row is always +prev+. Returning prev[t_len] also yields t_len
  # for an empty s.
  builder.c %{
    VALUE levenshtein_distance_C(
      VALUE _s,      // string
      VALUE _t,      // string
      VALUE _s_len,  // string length
      VALUE _t_len,  // string length
      VALUE _v0,     // score vector
      VALUE _v1      // score vector
    )
    {
      char         *s     = (char *) StringValuePtr(_s);
      char         *t     = (char *) StringValuePtr(_t);
      unsigned int  s_len = FIX2UINT(_s_len);
      unsigned int  t_len = FIX2UINT(_t_len);
      unsigned int *prev  = (unsigned int *) StringValuePtr(_v0);
      unsigned int *curr  = (unsigned int *) StringValuePtr(_v1);
      unsigned int *tmp   = prev;

      unsigned int i    = 0;
      unsigned int j    = 0;
      unsigned int cost = 0;

      for (j = 0; j < t_len + 1; j++)
        prev[j] = j;

      for (i = 0; i < s_len; i++)
      {
        curr[0] = i + 1;

        for (j = 0; j < t_len; j++)
        {
          cost = (MATCH(s[i], t[j])) ? 0 : 1;
          curr[j + 1] = min(curr[j] + 1, prev[j + 1] + 1, prev[j] + cost);
        }

        tmp  = prev;
        prev = curr;
        curr = tmp;
      }

      return UINT2NUM(prev[t_len]);
    }
  }
end
107
+ end
108
+ end
109
+
110
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
111
+
112
+
113
+ __END__
@@ -0,0 +1,109 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
module BioDSL
  # Mixin with methods for translating a DNA sequence entry to protein.
  #
  # The including class is expected to provide seq, seq_name, type, qual and
  # length plus the matching writers used by #translate!.
  module Translate
    # Translation table 11
    # (http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes#SG11)
    #   AAs    = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
    #   Starts = ---M---------------M------------MMMM---------------M------------
    #   Base1  = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
    #   Base2  = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
    #   Base3  = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG

    # Start codons of table 11; each one is translated to M.
    TRANS_TAB11_START = {
      "TTG" => "M", "CTG" => "M", "ATT" => "M", "ATC" => "M",
      "ATA" => "M", "ATG" => "M", "GTG" => "M"
    }.freeze

    # Codon to amino acid mapping of table 11 ("*" marks a stop codon).
    TRANS_TAB11 = {
      "TTT" => "F", "TCT" => "S", "TAT" => "Y", "TGT" => "C",
      "TTC" => "F", "TCC" => "S", "TAC" => "Y", "TGC" => "C",
      "TTA" => "L", "TCA" => "S", "TAA" => "*", "TGA" => "*",
      "TTG" => "L", "TCG" => "S", "TAG" => "*", "TGG" => "W",
      "CTT" => "L", "CCT" => "P", "CAT" => "H", "CGT" => "R",
      "CTC" => "L", "CCC" => "P", "CAC" => "H", "CGC" => "R",
      "CTA" => "L", "CCA" => "P", "CAA" => "Q", "CGA" => "R",
      "CTG" => "L", "CCG" => "P", "CAG" => "Q", "CGG" => "R",
      "ATT" => "I", "ACT" => "T", "AAT" => "N", "AGT" => "S",
      "ATC" => "I", "ACC" => "T", "AAC" => "N", "AGC" => "S",
      "ATA" => "I", "ACA" => "T", "AAA" => "K", "AGA" => "R",
      "ATG" => "M", "ACG" => "T", "AAG" => "K", "AGG" => "R",
      "GTT" => "V", "GCT" => "A", "GAT" => "D", "GGT" => "G",
      "GTC" => "V", "GCC" => "A", "GAC" => "D", "GGC" => "G",
      "GTA" => "V", "GCA" => "A", "GAA" => "E", "GGA" => "G",
      "GTG" => "V", "GCG" => "A", "GAG" => "E", "GGG" => "G"
    }.freeze

    # Translate this DNA entry to protein in place.
    #
    # The entry's seq_name, seq, type and qual are overwritten with those of
    # the entry returned by #translate.
    #
    # trans_tab - Integer translation table number (default: 11).
    #
    # Returns self.
    # Raises SeqError under the same conditions as #translate.
    def translate!(trans_tab = 11)
      entry = translate(trans_tab)

      self.seq_name = entry.seq_name ? entry.seq_name.dup : nil
      self.seq      = entry.seq.dup
      self.type     = entry.type
      self.qual     = entry.qual

      self
    end

    alias :to_protein! :translate!

    # Translate this DNA entry to a new protein entry.
    #
    # The first codon is looked up in the start codon table, all following
    # codons in the regular codon table. Codons are matched case
    # insensitively. The last translated residue is dropped from the result.
    #
    # NOTE(review): the last residue is removed unconditionally (presumably
    # the terminal stop codon); a sequence without a terminal stop loses its
    # last amino acid - confirm this is intended.
    #
    # trans_tab - Integer translation table number; only 11 is supported
    #             (default: 11).
    #
    # Returns a new Seq of type :protein carrying this entry's seq_name.
    # Raises SeqError if the entry is not of type :dna, if its length is not
    # a multiple of 3, if the translation table is unknown, or if the start
    # codon or any later codon is not found in the tables.
    def translate(trans_tab = 11)
      raise SeqError, "Sequence type must be 'dna' - not #{self.type}" unless self.type == :dna
      raise SeqError, "Sequence length must be a multiple of 3 - was: #{self.length}" unless (self.length % 3).zero?

      case trans_tab
      when 11
        codon_start_hash = TRANS_TAB11_START
        codon_hash       = TRANS_TAB11
      else
        raise SeqError, "Unknown translation table: #{trans_tab}"
      end

      start_codon = self.seq[0...3].upcase

      # dup: the protein string is appended to below and must not alias the
      # string stored in the table.
      protein = codon_start_hash.fetch(start_codon) do
        raise SeqError, "Unknown start codon: #{start_codon}"
      end.dup

      (3...self.length).step(3) do |i|
        codon = self.seq[i...i + 3].upcase

        protein << codon_hash.fetch(codon) { raise SeqError, "Unknown codon: #{codon}" }
      end

      Seq.new(seq_name: self.seq_name, seq: protein[0..-2], type: :protein)
    end

    alias :to_protein :translate
  end
end