BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,252 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
+ module BioDSL
28
+ # Error class for all exceptions to do with BackTrack.
29
+ class BackTrackError < StandardError; end
30
+
31
+ # Module containing code to locate nucleotide patterns in sequences allowing for
32
+ # ambiguity codes and a given maximum mismatches, insertions, and deletions. The
33
+ # pattern match engine is based on a backtrack algorithm.
34
+ # Insertions are nucleotides found in the pattern but not in the sequence.
35
+ # Deletions are nucleotides found in the sequence but not in the pattern.
36
+ # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
37
+ # http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/
38
+ module BackTrack
39
+ extend BioDSL::Ambiguity
40
+
41
# Patterns may only contain the residue/ambiguity codes listed below.
# \A and \z anchor the WHOLE string; the previous ^ and $ anchors match per
# line, so a multi-line string with one valid line passed validation.
OK_PATTERN = /\A[bflsycwphqrimtnkvadegu]+\z/
MAX_MIS = 5 # Maximum number of mismatches allowed
MAX_INS = 5 # Maximum number of insertions allowed
MAX_DEL = 5 # Maximum number of deletions allowed
45
+
46
# ------------------------------------------------------------------------------
#   str.patmatch(pattern[, options])
#     -> Match or nil
#   str.patmatch(pattern[, options]) { |match|
#     block
#   }
#     -> result of patscan in block context
#
#   options:
#     :start
#     :stop
#     :max_mismatches
#     :max_insertions
#     :max_deletions
#
# ------------------------------------------------------------------------------
# Method to iterate through a sequence from a given start position to the end of
# the sequence or to a given stop position to locate a pattern allowing for a
# maximum number of mismatches, insertions, and deletions. Insertions are
# nucleotides found in the pattern but not in the sequence. Deletions are
# nucleotides found in the sequence but not in the pattern.
#
# Without a block the first match is returned (nil when nothing matches). With
# a block every match is yielded. Defaults and validation of the options are
# handled by patscan.
def patmatch(pattern, options = {})
  # Hand patscan a copy: it fills in defaults and advances :start while
  # scanning, and none of that should leak into the caller's hash.
  patscan(pattern, options.dup) do |match|
    return match unless block_given?

    yield match
  end
end
82
+
83
# ------------------------------------------------------------------------------
#   str.patscan(pattern[, options])
#     -> Array
#   str.patscan(pattern[, options]) { |match|
#     block
#   }
#     -> nil
#
#   options:
#     :start            first position to try (default 0)
#     :stop             last position at which a match may start
#                       (default length - 1)
#     :max_mismatches   default 0, at most MAX_MIS
#     :max_insertions   default 0, at most MAX_INS
#     :max_deletions    default 0, at most MAX_DEL
#
# ------------------------------------------------------------------------------
# Method to iterate through a sequence from a given start position to the end of
# the sequence or to a given stop position to locate a pattern allowing for a
# maximum number of mismatches, insertions, and deletions. Insertions are
# nucleotides found in the pattern but not in the sequence. Deletions are
# nucleotides found in the sequence but not in the pattern. In block context
# each Match is yielded and nil is returned. Otherwise matches are returned in
# an Array of Match objects.
#
# Raises BackTrackError on a bad pattern or on any option outside its range.
def patscan(pattern, options = {})
  # Read the options into locals. The scan position is advanced below, and
  # writing it (or the defaults) back into the caller's hash made a reused
  # options hash start the next scan where the previous one stopped.
  start   = options[:start] || 0
  stop    = options[:stop] || length - 1
  max_mis = options[:max_mismatches] || 0
  max_ins = options[:max_insertions] || 0
  max_del = options[:max_deletions] || 0

  raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
  raise BackTrackError, "start: #{start} out of range (0 .. #{length - 1})" unless (0...length).include?(start)
  raise BackTrackError, "stop: #{stop} out of range (0 .. #{length - 1})" unless (0...length).include?(stop)
  raise BackTrackError, "max_mismatches: #{max_mis} out of range (0 .. #{MAX_MIS})" unless (0..MAX_MIS).include?(max_mis)
  raise BackTrackError, "max_insertions: #{max_ins} out of range (0 .. #{MAX_INS})" unless (0..MAX_INS).include?(max_ins)
  raise BackTrackError, "max_deletions: #{max_del} out of range (0 .. #{MAX_DEL})" unless (0..MAX_DEL).include?(max_del)

  matches = []

  # scan_C returns [position, length] for the first match at or after start,
  # or nil once nothing more is found up to stop.
  while (result = scan_C(seq, pattern, start, stop, max_mis, max_ins, max_del))
    pos = result.first
    len = result.last
    match = Match.new(pos, len, seq[pos...pos + len])

    if block_given?
      yield match
    else
      matches << match
    end

    # Resume one past the start of this match so overlapping matches are found.
    start = pos + 1
  end

  return matches unless block_given?
end
143
+
144
+ private
145
+
146
+ inline do |builder|
147
+ add_ambiguity_macro(builder)
148
+
149
+ # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
150
+ # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
151
+ # reporting the match endpoints. State is used to avoid ins followed by del and vice versa which
152
+ # are nonsense. Returns the match end offset relative to ss, or 0 when no match is found.
153
+ builder.prefix %{
154
+ unsigned int backtrack(
155
+ char *ss, // Sequence start
156
+ char *s, // Sequence
157
+ char *p, // Pattern
158
+ unsigned int mis, // Max mismatches
159
+ unsigned int ins, // Max insertions
160
+ unsigned int del, // Max deletions
161
+ int state // Last event: 0 = none or mismatch, 1 = insertion, -1 = deletion
162
+ )
163
+ {
164
+ unsigned int r = 0;
165
+
166
+ while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
167
+
168
+ if (!*p)
169
+ return (unsigned int) (s - ss);
170
+ else
171
+ {
172
+ if (mis && *s && *p && (r = backtrack(ss, s + 1, p + 1, mis - 1, ins, del, 0))) return r;
173
+ if (ins && *p && (state != -1) && (r = backtrack(ss, s, p + 1, mis, ins - 1, del, 1))) return r;
174
+ if (del && *s && (state != 1) && (r = backtrack(ss, s + 1, p, mis, ins, del - 1, -1))) return r;
175
+ }
176
+
177
+ return 0;
178
+ }
179
+ }
180
+
181
+ # Find pattern (p) in a sequence (s) trying each position from start to stop, with at most mis
182
+ # mismatches, ins insertions and del deletions. Returns [position, length] of the first match or nil.
183
+ builder.c %{
184
+ VALUE scan_C(
185
+ VALUE _s, // Sequence
186
+ VALUE _p, // Pattern
187
+ VALUE _start, // Search position start
188
+ VALUE _stop, // Search position stop
189
+ VALUE _mis, // Maximum mismatches
190
+ VALUE _ins, // Maximum insertions
191
+ VALUE _del // Maximum deletions
192
+ )
193
+ {
194
+ char *s = StringValuePtr(_s);
195
+ char *p = StringValuePtr(_p);
196
+ unsigned int start = FIX2UINT(_start);
197
+ unsigned int stop = FIX2UINT(_stop);
198
+ unsigned int mis = FIX2UINT(_mis);
199
+ unsigned int ins = FIX2UINT(_ins);
200
+ unsigned int del = FIX2UINT(_del);
201
+
202
+ char *ss = s;
203
+ int state = 0;
204
+ unsigned int i = 0;
205
+ unsigned int e = 0;
206
+ VALUE tuple;
207
+
208
+ s += start;
209
+
210
+ for (i = start; i <= stop; i++, s++)
211
+ {
212
+ if ((e = backtrack(ss, s, p, mis, ins, del, state)))
213
+ {
214
+ tuple = rb_ary_new();
215
+ rb_ary_push(tuple, INT2FIX((int) (s - ss)));
216
+ rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
217
+ return tuple;
218
+ }
219
+ }
220
+
221
+ return Qnil;
222
+ }
223
+ }
224
+ end
225
+
226
# Read-only record describing one pattern match: where it begins (pos), how
# many residues it spans (length) and the matched subsequence (match).
class Match
  attr_reader :pos, :length, :match

  def initialize(pos, length, match)
    @pos, @length, @match = pos, length, match
  end

  # First position covered by the match.
  def start
    pos
  end

  # Last position covered by the match (inclusive).
  def stop
    start + length - 1
  end

  # Renders the match as "pos:length:match".
  def to_s
    format('%s:%s:%s', pos, length, match)
  end
end
248
+ end
249
+ end
250
+
251
+
252
+ __END__
@@ -0,0 +1,99 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
# Namespace for BioDSL.
module BioDSL
  # Error class for all exceptions to do with Digest.
  DigestError = Class.new(StandardError)

  # Namespace for Digest.
  module Digest
    # Regexp fragment for each residue/ambiguity code accepted in a
    # restriction pattern.
    AMBIGUITY_REGEX = {
      'A' => 'A',
      'T' => 'T',
      'U' => 'T',
      'C' => 'C',
      'G' => 'G',
      'M' => '[AC]',
      'R' => '[AG]',
      'W' => '[AT]',
      'S' => '[CG]',
      'Y' => '[CT]',
      'K' => '[GT]',
      'V' => '[ACG]',
      'H' => '[ACT]',
      'D' => '[AGT]',
      'B' => '[CGT]',
      'N' => '[GATC]'
    }.freeze
    private_constant :AMBIGUITY_REGEX

    # Method to get the next digestion product from a sequence.
    #
    # pattern - restriction pattern; ambiguity codes are expanded by
    #           disambiguate.
    # cut_pos - cut position relative to the start of each pattern match.
    #
    # Yields each fragment (obtained with self[range]) after setting its
    # seq_name to "<seq_name>[<first>-<last>]", where first and last are the
    # inclusive coordinates of the fragment in this sequence. Returns an
    # Enumerator when no block is given.
    #
    # Raises DigestError if the pattern contains an unknown residue.
    def each_digest(pattern, cut_pos)
      return to_enum(:each_digest, pattern, cut_pos) unless block_given?

      pattern = disambiguate(pattern)
      offset  = 0

      seq.upcase.scan(pattern) do
        pos = Regexp.last_match.pre_match.length + cut_pos

        # Cut positions outside the accepted window are skipped entirely.
        # offset must not move for them, otherwise the stretch up to the
        # rejected position is lost from (or duplicated in) the last fragment.
        next unless pos >= 0 && pos < length - 2

        subseq = self[offset...pos]
        # The end coordinate is pos - 1, matching the "[offset-(length - 1)]"
        # form used for the final fragment below.
        subseq.seq_name = "#{seq_name}[#{offset}-#{pos - 1}]"

        yield subseq

        offset = pos
      end

      subseq = self[offset..-1]
      subseq.seq_name = "#{seq_name}[#{offset}-#{length - 1}]"

      yield subseq
    end

    private

    # Method that returns a regexp object with a restriction
    # enzyme pattern with ambiguity codes substituted to the
    # appropriate regexp.
    #
    # Raises DigestError for any residue missing from AMBIGUITY_REGEX.
    def disambiguate(pattern)
      source = pattern.upcase.each_char.map do |char|
        AMBIGUITY_REGEX.fetch(char) do
          fail DigestError, "Could not disambiguate residue: #{char}"
        end
      end

      Regexp.new(source.join)
    end
  end
end
@@ -0,0 +1,263 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
+ module BioDSL
28
+ # Error class for Dynamic.
29
+ class DynamicError < StandardError; end
30
+
31
+ # Module containing code to locate nucleotide patterns in sequences allowing for
32
+ # ambiguity codes and a given maximum edit distance.
33
+ # Insertions are nucleotides found in the pattern but not in the sequence.
34
+ # Deletions are nucleotides found in the sequence but not in the pattern.
35
+ #
36
+ # Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
37
+ # http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
38
+ module Dynamic
39
+ extend BioDSL::Ambiguity
40
+
41
# ------------------------------------------------------------------------------
#   str.patmatch(pattern[, pos[, max_edit_distance]])
#     -> Match or nil
#
# ------------------------------------------------------------------------------
# Method to locate the first pattern match in a sequence, scanning from a
# given position and allowing for a maximum edit distance. The search itself
# is delegated to patscan; the first match it yields is returned at once. If
# patscan yields nothing, its own return value in block context is returned.
def patmatch(pattern, pos = 0, max_edit_distance = 0)
  patscan(pattern, pos, max_edit_distance) { |first_hit| return first_hit }
end
57
+
58
# ------------------------------------------------------------------------------
#   str.patscan(pattern[, pos[, max_edit_distance]])
#     -> Array
#   str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
#     block
#   }
#     -> nil
#
# ------------------------------------------------------------------------------
# Method to iterate through a sequence to locate pattern matches starting from a
# given position and allowing for a maximum edit distance. In block context
# each Match is yielded and nil is returned. Otherwise matches are returned in
# an Array.
#
# Raises DynamicError if the pattern is longer than the C matcher can hold.
def patscan(pattern, pos = 0, max_edit_distance = 0)
  # match_C keeps its scores in a fixed array of MAX_PAT (1024) cells and
  # touches index pattern length + 1, so longer patterns would run past it.
  max_pattern_length = 1022

  if pattern.length > max_pattern_length
    raise DynamicError, "Pattern too long: #{pattern.length} > #{max_pattern_length}"
  end

  matches = []

  # match_C returns [beg, length, mis, ins, del] for the next match at or
  # after pos, or false when there is none.
  while (result = match_C(seq, length, pattern, pattern.length, pos, max_edit_distance))
    match = Match.new(*result, seq[result[0]...result[0] + result[1]])

    if block_given?
      yield match
    else
      matches << match
    end

    # Continue one past the start of this match.
    pos = match.beg + 1
  end

  return matches unless block_given?
end
88
+
89
+ private
90
+
91
+ inline do |builder|
92
+ add_ambiguity_macro(builder)
93
+
94
+ # Capacity of the fixed-size score vector declared in match_C; the pattern length is not checked against it here.
95
+ builder.prefix %{
96
+ #define MAX_PAT 1024
97
+ }
98
+
99
+ builder.prefix %{
100
+ typedef struct
101
+ {
102
+ unsigned int mis;
103
+ unsigned int ins;
104
+ unsigned int del;
105
+ unsigned int ed;
106
+ } score;
107
+ }
108
+
109
+ builder.prefix %{
110
+ void vector_init(score *vec, unsigned int vec_len)
111
+ {
112
+ unsigned int i = 0;
113
+
114
+ for (i = 1; i < vec_len; i++)
115
+ {
116
+ vec[i].ins = i;
117
+ vec[i].ed = i;
118
+ }
119
+ }
120
+ }
121
+
122
+ builder.prefix %{
123
+ void vector_print(score *vec, unsigned int vec_len)
124
+ {
125
+ unsigned int i = 0;
126
+
127
+ for (i = 0; i < vec_len; i++)
128
+ {
129
+ printf("i: %d mis: %d ins: %d del: %d ed: %d\\n", i, vec[i].mis, vec[i].ins, vec[i].del, vec[i].ed);
130
+ }
131
+
132
+ printf("---\\n");
133
+ }
134
+ }
135
+
136
+ builder.prefix %{
137
+ int match_found(score *vec, unsigned int pat_len, unsigned int max_ed)
138
+ {
139
+ return (vec[pat_len].ed <= max_ed);
140
+ }
141
+ }
142
+
143
+ builder.prefix %{
144
+ void vector_update(score *vec, char *seq, char *pat, unsigned int pat_len, unsigned int pos)
145
+ {
146
+ score diag = vec[0];
147
+ score up = {0, 0, 0, 0}; // insertion
148
+ score left = vec[1]; // deletion
149
+ score new = {0, 0, 0, 0};
150
+
151
+ unsigned int i = 0;
152
+
153
+ for (i = 0; i < pat_len; i++)
154
+ {
155
+ if (MATCH(seq[pos], pat[i])) // match
156
+ {
157
+ new = diag;
158
+ }
159
+ else
160
+ {
161
+ if (left.ed <= diag.ed && left.ed <= up.ed) // deletion
162
+ {
163
+ new = left;
164
+ new.del++;
165
+ }
166
+ else if (diag.ed <= up.ed && diag.ed <= left.ed) // mismatch
167
+ {
168
+ new = diag;
169
+ new.mis++;
170
+ }
171
+ else if (up.ed <= diag.ed && up.ed <= left.ed) // insertion
172
+ {
173
+ new = up;
174
+ new.ins++;
175
+ }
176
+ else
177
+ {
178
+ printf("This should not happen\\n");
179
+ exit(1);
180
+ }
181
+
182
+ new.ed++;
183
+ }
184
+
185
+ diag = vec[i + 1];
186
+ up = new;
187
+ left = vec[i + 2];
188
+
189
+ vec[i + 1] = new;
190
+ }
191
+ }
192
+ }
193
+
194
+ builder.c %{
195
+ VALUE match_C(
196
+ VALUE _seq, // Sequence
197
+ VALUE _seq_len, // Sequence length
198
+ VALUE _pat, // Pattern
199
+ VALUE _pat_len, // Pattern length
200
+ VALUE _pos, // Offset position
201
+ VALUE _max_ed // Maximum edit distance
202
+ )
203
+ {
204
+ char *seq = (char *) StringValuePtr(_seq);
205
+ char *pat = (char *) StringValuePtr(_pat);
206
+ unsigned int seq_len = FIX2UINT(_seq_len);
207
+ unsigned int pat_len = FIX2UINT(_pat_len);
208
+ unsigned int pos = FIX2UINT(_pos);
209
+ unsigned int max_ed = FIX2UINT(_max_ed);
210
+
211
+ score vec[MAX_PAT] = {0};
212
+ unsigned int vec_len = pat_len + 1;
213
+ unsigned int match_beg = 0;
214
+ unsigned int match_len = 0;
215
+
216
+ VALUE match_ary;
217
+
218
+ vector_init(vec, vec_len);
219
+
220
+ while (pos < seq_len)
221
+ {
222
+ vector_update(vec, seq, pat, pat_len, pos);
223
+
224
+ if (match_found(vec, pat_len, max_ed))
225
+ {
226
+ match_len = pat_len - vec[pat_len].ins + vec[pat_len].del;
227
+ match_beg = pos - match_len + 1;
228
+
229
+ match_ary = rb_ary_new();
230
+ rb_ary_push(match_ary, INT2FIX(match_beg));
231
+ rb_ary_push(match_ary, INT2FIX(match_len));
232
+ rb_ary_push(match_ary, INT2FIX(vec[pat_len].mis));
233
+ rb_ary_push(match_ary, INT2FIX(vec[pat_len].ins));
234
+ rb_ary_push(match_ary, INT2FIX(vec[pat_len].del));
235
+
236
+ return match_ary;
237
+ }
238
+
239
+ pos++;
240
+ }
241
+
242
+ return Qfalse; // no match
243
+ }
244
+ }
245
+ end
246
+
247
# Record describing one approximate match: its begin position (beg), its
# length, the number of mismatches (mis), insertions (ins) and deletions
# (del), and the matched subsequence (match). All fields are writable.
class Match
  attr_accessor :beg, :length
  attr_accessor :mis, :ins, :del
  attr_accessor :match

  def initialize(beg, length, mis, ins, del, match)
    @beg, @length = beg, length
    @mis, @ins, @del = mis, ins, del
    @match = match
  end
end
259
+ end
260
+ end
261
+
262
+
263
+ __END__
@@ -0,0 +1,59 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
module BioDSL
  # Error class for all exceptions to do with Homopolymer.
  class HomopolymerError < StandardError; end

  # Mixin for locating homopolymers, i.e. runs of a single residue (A, T, G,
  # C or N), in the including object's seq. The sequence is upcased before
  # scanning.
  module Homopolymer
    # Locates every homopolymer that is at least min residues long.
    #
    # With a block each Homopolymer is yielded as it is found and self is
    # returned. Without a block an Array of Homopolymer objects is returned.
    #
    # Raises HomopolymerError if min is zero or negative.
    def each_homopolymer(min = 1)
      raise HomopolymerError, "Bad min value: #{min}" if min <= 0

      runs   = %w[A T G C N].map { |residue| "#{residue}{#{min},}" }
      regex  = Regexp.new(runs.join('|'))
      found  = []

      seq.upcase.scan(regex) do |run|
        # The start of the current match equals the length of the text
        # preceding it.
        hit = Homopolymer.new(run, run.length, Regexp.last_match.begin(0))

        if block_given?
          yield hit
        else
          found << hit
        end
      end

      return self if block_given?

      found
    end

    # Record for one homopolymer: the matched run (pattern), its length and
    # its start position (pos) in the sequence.
    class Homopolymer
      attr_reader :pattern, :length, :pos

      def initialize(pattern, length, pos)
        @pattern, @length, @pos = pattern, length, pos
      end
    end
  end
end