BioDSL 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,252 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
module BioDSL
  # Error class for all exceptions to do with BackTrack.
  class BackTrackError < StandardError; end

  # Module containing code to locate nucleotide patterns in sequences allowing
  # for ambiguity codes and a given maximum mismatches, insertions, and
  # deletions. The pattern match engine is based on a backtrack algorithm.
  # Insertions are nucleotides found in the pattern but not in the sequence.
  # Deletions are nucleotides found in the sequence but not in the pattern.
  # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
  # http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/
  #
  # The methods below call +seq+ and +length+ on the object the module is mixed
  # into.
  module BackTrack
    extend BioDSL::Ambiguity

    # Residues accepted in a pattern. \A and \z anchor the whole string; ^ and $
    # only anchor a line, which would let a multi-line pattern slip through as
    # long as one of its lines was valid.
    OK_PATTERN = Regexp.new('\A[bflsycwphqrimtnkvadegu]+\z')
    MAX_MIS    = 5 # Maximum number of mismatches allowed
    MAX_INS    = 5 # Maximum number of insertions allowed
    MAX_DEL    = 5 # Maximum number of deletions allowed

    # --------------------------------------------------------------------------
    #   str.patmatch(pattern[, options])
    #   -> Match or nil
    #   str.patmatch(pattern[, options]) { |match|
    #     block
    #   }
    #   -> nil
    #
    #   options:
    #     :start
    #     :stop
    #     :max_mismatches
    #     :max_insertions
    #     :max_deletions
    #
    # --------------------------------------------------------------------------
    # Method to iterate through a sequence from a given start position to the
    # end of the sequence or to a given stop position to locate a pattern
    # allowing for a maximum number of mismatches, insertions, and deletions.
    # Insertions are nucleotides found in the pattern but not in the sequence.
    # Deletions are nucleotides found in the sequence but not in the pattern.
    #
    # Without a block the first match is returned (nil if there is none). With
    # a block every match is yielded. Option defaults and validation are
    # handled by patscan; the given options Hash is not modified.
    def patmatch(pattern, options = {})
      patscan(pattern, options) do |m|
        return m unless block_given?

        yield m
      end
    end

    # --------------------------------------------------------------------------
    #   str.patscan(pattern[, options])
    #   -> Array
    #   str.patscan(pattern[, options]) { |match|
    #     block
    #   }
    #   -> nil
    #
    #   options:
    #     :start
    #     :stop
    #     :max_mismatches
    #     :max_insertions
    #     :max_deletions
    #
    # --------------------------------------------------------------------------
    # Method to iterate through a sequence from a given start position to the
    # end of the sequence or to a given stop position to locate a pattern
    # allowing for a maximum number of mismatches, insertions, and deletions.
    # Insertions are nucleotides found in the pattern but not in the sequence.
    # Deletions are nucleotides found in the sequence but not in the pattern.
    # Matches found in block context are yielded as Match objects. Otherwise
    # matches are returned in an Array of Match objects.
    #
    # Raises BackTrackError if the pattern contains unsupported residues or if
    # any option is out of range.
    #
    # The options are copied into locals so the caller's Hash is left untouched;
    # a Hash reused across several sequences therefore no longer carries a stale
    # :start or :stop from one call to the next.
    def patscan(pattern, options = {})
      start   = options[:start]          || 0
      stop    = options[:stop]           || length - 1
      max_mis = options[:max_mismatches] || 0
      max_ins = options[:max_insertions] || 0
      max_del = options[:max_deletions]  || 0

      raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
      raise BackTrackError, "start: #{start} out of range (0 .. #{length - 1})" unless (0...length).include?(start)
      raise BackTrackError, "stop: #{stop} out of range (0 .. #{length - 1})" unless (0...length).include?(stop)
      raise BackTrackError, "max_mismatches: #{max_mis} out of range (0 .. #{MAX_MIS})" unless (0..MAX_MIS).include?(max_mis)
      raise BackTrackError, "max_insertions: #{max_ins} out of range (0 .. #{MAX_INS})" unless (0..MAX_INS).include?(max_ins)
      raise BackTrackError, "max_deletions: #{max_del} out of range (0 .. #{MAX_DEL})" unless (0..MAX_DEL).include?(max_del)

      matches = []

      # scan_C returns [position, length] for the first hit at or after start,
      # or nil when there are no more hits.
      while (hit = scan_C(seq, pattern, start, stop, max_mis, max_ins, max_del))
        pos, len = hit
        match    = Match.new(pos, len, seq[pos...pos + len])

        if block_given?
          yield match
        else
          matches << match
        end

        # Resume one position past the start of this hit.
        start = pos + 1
      end

      matches unless block_given?
    end

    private

    inline do |builder|
      add_ambiguity_macro(builder)

      # Backtrack algorithm for matching a pattern (p) starting in a sequence
      # (s) allowing for mis mismatches, ins insertions and del deletions. ss is
      # the start of the sequence, used only for reporting the match endpoints.
      # State is used to avoid ins followed by del and vice versa which are
      # nonsense.
      builder.prefix %{
        unsigned int backtrack(
          char         *ss,     // Sequence start
          char         *s,      // Sequence
          char         *p,      // Pattern
          unsigned int  mis,    // Max mismatches
          unsigned int  ins,    // Max insertions
          unsigned int  del,    // Max deletions
          int           state   // Last event: mis, ins or del
        )
        {
          unsigned int r = 0;

          while (*s && MATCH(*s, *p)) ++s, ++p;    // OK to always match longest segment

          if (!*p)
            return (unsigned int) (s - ss);
          else
          {
            if (mis && *s && *p && (r = backtrack(ss, s + 1, p + 1, mis - 1, ins, del, 0))) return r;
            if (ins && *p && (state != -1) && (r = backtrack(ss, s, p + 1, mis, ins - 1, del, 1))) return r;
            if (del && *s && (state != 1) && (r = backtrack(ss, s + 1, p, mis, ins, del - 1, -1))) return r;
          }

          return 0;
        }
      }

      # Find pattern (p) in a sequence (s) starting at pos, with at most mis
      # mismatches, ins insertions and del deletions.
      builder.c %{
        VALUE scan_C(
          VALUE _s,       // Sequence
          VALUE _p,       // Pattern
          VALUE _start,   // Search position start
          VALUE _stop,    // Search position stop
          VALUE _mis,     // Maximum mismatches
          VALUE _ins,     // Maximum insertions
          VALUE _del      // Maximum deletions
        )
        {
          char         *s     = StringValuePtr(_s);
          char         *p     = StringValuePtr(_p);
          unsigned int  start = FIX2UINT(_start);
          unsigned int  stop  = FIX2UINT(_stop);
          unsigned int  mis   = FIX2UINT(_mis);
          unsigned int  ins   = FIX2UINT(_ins);
          unsigned int  del   = FIX2UINT(_del);

          char         *ss    = s;
          int           state = 0;
          unsigned int  i     = 0;
          unsigned int  e     = 0;
          VALUE         tuple;

          s += start;

          for (i = start; i <= stop; i++, s++)
          {
            if ((e = backtrack(ss, s, p, mis, ins, del, state)))
            {
              tuple = rb_ary_new();
              rb_ary_push(tuple, INT2FIX((int) (s - ss)));
              rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
              return tuple;
            }
          }

          return Qnil;
        }
      }
    end

    # Class containing match information.
    class Match
      attr_reader :pos, :length, :match

      # pos is the match start position, length the number of sequence residues
      # matched and match the matched subsequence.
      def initialize(pos, length, match)
        @pos    = pos
        @length = length
        @match  = match
      end

      # Position of the first matched residue.
      def start
        @pos
      end

      # Position of the last matched residue.
      def stop
        @pos + @length - 1
      end

      # Match formatted as "pos:length:match".
      def to_s
        "#{pos}:#{length}:#{match}"
      end
    end
  end
end
250
+
251
+
252
+ __END__
@@ -0,0 +1,99 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
29
module BioDSL
  # Error class for all exceptions to do with Digest.
  DigestError = Class.new(StandardError)

  # Namespace for Digest.
  #
  # The methods below call +seq+, +seq_name+, +length+ and +[]+ on the object
  # the module is mixed into, and +seq_name=+ on the objects returned by +[]+.
  module Digest
    # Regexp fragment for each residue code accepted in a restriction enzyme
    # pattern. Built once instead of on every call.
    DIGEST_AMBIGUITY = {
      'A' => 'A',
      'T' => 'T',
      'U' => 'T',
      'C' => 'C',
      'G' => 'G',
      'M' => '[AC]',
      'R' => '[AG]',
      'W' => '[AT]',
      'S' => '[CG]',
      'Y' => '[CT]',
      'K' => '[GT]',
      'V' => '[ACG]',
      'H' => '[ACT]',
      'D' => '[AGT]',
      'B' => '[CGT]',
      'N' => '[GATC]'
    }.freeze
    private_constant :DIGEST_AMBIGUITY

    # Method to get the next digestion product from a sequence.
    #
    # pattern is a restriction enzyme recognition pattern (ambiguity codes
    # allowed) and cut_pos is the cut position relative to the start of each
    # pattern match. Each fragment is yielded with its seq_name set to
    # "<seq_name>[<first>-<last>]", where first and last are the positions of
    # the fragment in the undigested sequence. Returns an Enumerator when no
    # block is given.
    #
    # Raises DigestError if the pattern contains an unknown residue.
    def each_digest(pattern, cut_pos)
      return to_enum(:each_digest, pattern, cut_pos) unless block_given?

      regex  = disambiguate(pattern)
      offset = 0

      seq.upcase.scan(regex) do
        pos = Regexp.last_match.begin(0) + cut_pos

        if pos >= 0 && pos < length - 2
          subseq = self[offset...pos]
          # The fragment spans offset .. pos - 1. The last coordinate used to be
          # written as pos - offset - 1 (the fragment length minus one), which
          # was only right for the first fragment and disagreed with the
          # labelling of the final fragment below.
          subseq.seq_name = "#{seq_name}[#{offset}-#{pos - 1}]"

          yield subseq
        end

        # NOTE(review): offset advances even when the fragment above was not
        # yielded - confirm this is intended.
        offset = pos
      end

      offset = 0 if offset < 0 || offset > length
      subseq = self[offset..-1]
      subseq.seq_name = "#{seq_name}[#{offset}-#{length - 1}]"

      yield subseq
    end

    private

    # Method that returns a regexp object with a restriction
    # enzyme pattern with ambiguity codes substituted to the
    # appropriate regexp.
    def disambiguate(pattern)
      fragments = pattern.upcase.each_char.map do |char|
        DIGEST_AMBIGUITY[char] ||
          fail(DigestError, "Could not disambiguate residue: #{char}")
      end

      Regexp.new(fragments.join)
    end
  end
end
@@ -0,0 +1,263 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
module BioDSL
  # Error class for Dynamic.
  class DynamicError < StandardError; end

  # Module containing code to locate nucleotide patterns in sequences allowing
  # for ambiguity codes and a given maximum edit distance.
  # Insertions are nucleotides found in the pattern but not in the sequence.
  # Deletions are nucleotides found in the sequence but not in the pattern.
  #
  # Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
  # http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
  #
  # The methods below call +seq+ and +length+ on the object the module is mixed
  # into.
  module Dynamic
    extend BioDSL::Ambiguity

    # Longest pattern the C code can handle. The score vector has MAX_PAT
    # (1024) entries and vector_update touches entries up to pattern length
    # + 1, so this must stay at MAX_PAT - 2.
    MAX_PATTERN_LENGTH = 1022

    # --------------------------------------------------------------------------
    #   str.patmatch(pattern[, pos[, max_edit_distance]])
    #   -> Match or nil
    #
    # --------------------------------------------------------------------------
    # Method to iterate through a sequence to locate the first pattern match
    # starting from a given position and allowing for a maximum edit distance.
    #
    # Raises DynamicError if the pattern is longer than MAX_PATTERN_LENGTH.
    def patmatch(pattern, pos = 0, max_edit_distance = 0)
      patscan(pattern, pos, max_edit_distance) do |m|
        return m
      end
    end

    # --------------------------------------------------------------------------
    #   str.patscan(pattern[, pos[, max_edit_distance]])
    #   -> Array
    #   str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
    #     block
    #   }
    #   -> nil
    #
    # --------------------------------------------------------------------------
    # Method to iterate through a sequence to locate pattern matches starting
    # from a given position and allowing for a maximum edit distance. Matches
    # found in block context are yielded as Match objects. Otherwise matches
    # are returned in an Array.
    #
    # Raises DynamicError if the pattern is longer than MAX_PATTERN_LENGTH;
    # without this check a longer pattern would make the C code write past the
    # end of its fixed-size score vector.
    def patscan(pattern, pos = 0, max_edit_distance = 0)
      if pattern.length > MAX_PATTERN_LENGTH
        raise DynamicError, "Pattern too long: #{pattern.length} > #{MAX_PATTERN_LENGTH}"
      end

      matches = []

      # match_C returns [beg, length, mis, ins, del] for the first hit found
      # when scanning from pos, or false when there is none.
      while (result = match_C(seq, length, pattern, pattern.length, pos, max_edit_distance))
        beg, len = result
        match    = Match.new(*result, seq[beg...beg + len])

        if block_given?
          yield match
        else
          matches << match
        end

        # Resume one position past the start of this hit.
        pos = match.beg + 1
      end

      matches unless block_given?
    end

    private

    inline do |builder|
      add_ambiguity_macro(builder)

      # Size of the score vector; bounds the pattern length that can be handled
      # (see MAX_PATTERN_LENGTH).
      builder.prefix %{
        #define MAX_PAT 1024
      }

      # Per-cell score: mismatch, insertion and deletion counts plus the total
      # edit distance.
      builder.prefix %{
        typedef struct
        {
          unsigned int mis;
          unsigned int ins;
          unsigned int del;
          unsigned int ed;
        } score;
      }

      # Initialize entries 1 .. vec_len - 1 so that entry i has i insertions
      # and edit distance i. Entry 0 is left as is.
      builder.prefix %{
        void vector_init(score *vec, unsigned int vec_len)
        {
          unsigned int i = 0;

          for (i = 1; i < vec_len; i++)
          {
            vec[i].ins = i;
            vec[i].ed  = i;
          }
        }
      }

      # Debug helper printing every entry of the score vector.
      builder.prefix %{
        void vector_print(score *vec, unsigned int vec_len)
        {
          unsigned int i = 0;

          for (i = 0; i < vec_len; i++)
          {
            printf("i: %d   mis: %d   ins: %d   del: %d   ed: %d\\n", i, vec[i].mis, vec[i].ins, vec[i].del, vec[i].ed);
          }

          printf("---\\n");
        }
      }

      # A match is found when the edit distance in the last vector entry is
      # within max_ed.
      builder.prefix %{
        int match_found(score *vec, unsigned int pat_len, unsigned int max_ed)
        {
          return (vec[pat_len].ed <= max_ed);
        }
      }

      # Update the score vector in place for the sequence residue at pos. Reads
      # vec[i + 2] for i up to pat_len - 1, so vec must hold at least
      # pat_len + 2 entries.
      builder.prefix %{
        void vector_update(score *vec, char *seq, char *pat, unsigned int pat_len, unsigned int pos)
        {
          score diag = vec[0];
          score up   = {0, 0, 0, 0};   // insertion
          score left = vec[1];         // deletion
          score new  = {0, 0, 0, 0};

          unsigned int i = 0;

          for (i = 0; i < pat_len; i++)
          {
            if (MATCH(seq[pos], pat[i]))                        // match
            {
              new = diag;
            }
            else
            {
              if (left.ed <= diag.ed && left.ed <= up.ed)       // deletion
              {
                new = left;
                new.del++;
              }
              else if (diag.ed <= up.ed && diag.ed <= left.ed)  // mismatch
              {
                new = diag;
                new.mis++;
              }
              else if (up.ed <= diag.ed && up.ed <= left.ed)    // insertion
              {
                new = up;
                new.ins++;
              }
              else
              {
                printf("This should not happen\\n");
                exit(1);
              }

              new.ed++;
            }

            diag = vec[i + 1];
            up   = new;
            left = vec[i + 2];

            vec[i + 1] = new;
          }
        }
      }

      # Scan the sequence from pos and return [beg, length, mis, ins, del] for
      # the first position where the pattern matches within max_ed, or false.
      builder.c %{
        VALUE match_C(
          VALUE _seq,       // Sequence
          VALUE _seq_len,   // Sequence length
          VALUE _pat,       // Pattern
          VALUE _pat_len,   // Pattern length
          VALUE _pos,       // Offset position
          VALUE _max_ed     // Maximum edit distance
        )
        {
          char         *seq     = (char *) StringValuePtr(_seq);
          char         *pat     = (char *) StringValuePtr(_pat);
          unsigned int  seq_len = FIX2UINT(_seq_len);
          unsigned int  pat_len = FIX2UINT(_pat_len);
          unsigned int  pos     = FIX2UINT(_pos);
          unsigned int  max_ed  = FIX2UINT(_max_ed);

          score         vec[MAX_PAT] = {0};
          unsigned int  vec_len      = pat_len + 1;
          unsigned int  match_beg    = 0;
          unsigned int  match_len    = 0;

          VALUE         match_ary;

          vector_init(vec, vec_len);

          while (pos < seq_len)
          {
            vector_update(vec, seq, pat, pat_len, pos);

            if (match_found(vec, pat_len, max_ed))
            {
              match_len = pat_len - vec[pat_len].ins + vec[pat_len].del;
              match_beg = pos - match_len + 1;

              match_ary = rb_ary_new();
              rb_ary_push(match_ary, INT2FIX(match_beg));
              rb_ary_push(match_ary, INT2FIX(match_len));
              rb_ary_push(match_ary, INT2FIX(vec[pat_len].mis));
              rb_ary_push(match_ary, INT2FIX(vec[pat_len].ins));
              rb_ary_push(match_ary, INT2FIX(vec[pat_len].del));

              return match_ary;
            }

            pos++;
          }

          return Qfalse;   // no match
        }
      }
    end

    # Class containing match information: start position, length, the number
    # of mismatches, insertions and deletions, and the matched subsequence.
    class Match
      attr_accessor :beg, :length, :mis, :ins, :del, :match

      def initialize(beg, length, mis, ins, del, match)
        @beg    = beg
        @length = length
        @mis    = mis
        @ins    = ins
        @del    = del
        @match  = match
      end
    end
  end
end
261
+
262
+
263
+ __END__
@@ -0,0 +1,59 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
module BioDSL
  # Error class for all exceptions to do with Homopolymer.
  class HomopolymerError < StandardError; end

  # Module with a method for locating homopolymers (stretches of a single
  # repeated residue) in a sequence. The method calls +seq+ on the object the
  # module is mixed into.
  module Homopolymer
    # Locate stretches of at least min identical residues (A, T, G, C or N,
    # case insensitive) in the sequence.
    #
    # With a block each Homopolymer is yielded and self is returned. Without a
    # block an Array of Homopolymer objects is returned.
    #
    # Raises HomopolymerError unless min is a positive Integer. A non-integer
    # min used to slip through and build a regex such as A{1.5,}, which is not
    # a quantifier and silently found nothing.
    def each_homopolymer(min = 1)
      unless min.is_a?(Integer) && min > 0
        raise HomopolymerError, "Bad min value: #{min}"
      end

      regex = /A{#{min},}|T{#{min},}|G{#{min},}|C{#{min},}|N{#{min},}/
      list  = []

      seq.upcase.scan(regex) do |stretch|
        # begin(0) is the position of the stretch in the sequence.
        hp = Homopolymer.new(stretch, stretch.length, Regexp.last_match.begin(0))

        if block_given?
          yield hp
        else
          list << hp
        end
      end

      block_given? ? self : list
    end

    # Class containing homopolymer information: the matched stretch, its
    # length and its position in the sequence.
    class Homopolymer
      attr_reader :pattern, :length, :pos

      def initialize(pattern, length, pos)
        @pattern = pattern
        @length  = length
        @pos     = pos
      end
    end
  end
end