BioDSL 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/BioDSL.gemspec +1 -1
  4. data/Gemfile +6 -0
  5. data/README.md +289 -155
  6. data/Rakefile +18 -16
  7. data/lib/BioDSL.rb +1 -1
  8. data/lib/BioDSL/cary.rb +78 -53
  9. data/lib/BioDSL/command.rb +2 -2
  10. data/lib/BioDSL/commands.rb +1 -1
  11. data/lib/BioDSL/commands/add_key.rb +1 -1
  12. data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
  13. data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
  14. data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
  15. data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
  16. data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
  17. data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
  18. data/lib/BioDSL/commands/classify_seq.rb +8 -8
  19. data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
  20. data/lib/BioDSL/commands/clip_primer.rb +7 -7
  21. data/lib/BioDSL/commands/cluster_otus.rb +5 -5
  22. data/lib/BioDSL/commands/collapse_otus.rb +2 -2
  23. data/lib/BioDSL/commands/collect_otus.rb +2 -2
  24. data/lib/BioDSL/commands/complement_seq.rb +4 -4
  25. data/lib/BioDSL/commands/count.rb +1 -1
  26. data/lib/BioDSL/commands/count_values.rb +2 -2
  27. data/lib/BioDSL/commands/degap_seq.rb +6 -7
  28. data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
  29. data/lib/BioDSL/commands/dump.rb +2 -2
  30. data/lib/BioDSL/commands/filter_rrna.rb +4 -4
  31. data/lib/BioDSL/commands/genecall.rb +7 -7
  32. data/lib/BioDSL/commands/grab.rb +1 -1
  33. data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
  34. data/lib/BioDSL/commands/mask_seq.rb +4 -4
  35. data/lib/BioDSL/commands/mean_scores.rb +2 -2
  36. data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
  37. data/lib/BioDSL/commands/merge_table.rb +1 -1
  38. data/lib/BioDSL/commands/merge_values.rb +1 -1
  39. data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
  40. data/lib/BioDSL/commands/plot_histogram.rb +4 -4
  41. data/lib/BioDSL/commands/plot_matches.rb +5 -5
  42. data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
  43. data/lib/BioDSL/commands/plot_scores.rb +7 -7
  44. data/lib/BioDSL/commands/random.rb +1 -1
  45. data/lib/BioDSL/commands/read_fasta.rb +9 -9
  46. data/lib/BioDSL/commands/read_fastq.rb +16 -16
  47. data/lib/BioDSL/commands/read_table.rb +2 -3
  48. data/lib/BioDSL/commands/reverse_seq.rb +4 -4
  49. data/lib/BioDSL/commands/slice_align.rb +4 -4
  50. data/lib/BioDSL/commands/slice_seq.rb +3 -3
  51. data/lib/BioDSL/commands/sort.rb +1 -1
  52. data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
  53. data/lib/BioDSL/commands/split_values.rb +2 -2
  54. data/lib/BioDSL/commands/trim_primer.rb +13 -8
  55. data/lib/BioDSL/commands/trim_seq.rb +5 -5
  56. data/lib/BioDSL/commands/uchime_ref.rb +6 -6
  57. data/lib/BioDSL/commands/uclust.rb +5 -5
  58. data/lib/BioDSL/commands/unique_values.rb +1 -1
  59. data/lib/BioDSL/commands/usearch_global.rb +2 -2
  60. data/lib/BioDSL/commands/usearch_local.rb +2 -2
  61. data/lib/BioDSL/commands/write_fasta.rb +7 -9
  62. data/lib/BioDSL/commands/write_fastq.rb +4 -4
  63. data/lib/BioDSL/commands/write_table.rb +3 -3
  64. data/lib/BioDSL/commands/write_tree.rb +2 -3
  65. data/lib/BioDSL/config.rb +2 -2
  66. data/lib/BioDSL/csv.rb +8 -10
  67. data/lib/BioDSL/debug.rb +1 -1
  68. data/lib/BioDSL/fasta.rb +54 -40
  69. data/lib/BioDSL/fastq.rb +35 -32
  70. data/lib/BioDSL/filesys.rb +56 -47
  71. data/lib/BioDSL/fork.rb +1 -1
  72. data/lib/BioDSL/hamming.rb +1 -1
  73. data/lib/BioDSL/helpers.rb +1 -1
  74. data/lib/BioDSL/helpers/aux_helper.rb +1 -1
  75. data/lib/BioDSL/helpers/email_helper.rb +1 -1
  76. data/lib/BioDSL/helpers/history_helper.rb +1 -1
  77. data/lib/BioDSL/helpers/log_helper.rb +1 -1
  78. data/lib/BioDSL/helpers/options_helper.rb +1 -1
  79. data/lib/BioDSL/helpers/status_helper.rb +1 -1
  80. data/lib/BioDSL/html_report.rb +1 -1
  81. data/lib/BioDSL/math.rb +1 -1
  82. data/lib/BioDSL/mummer.rb +1 -1
  83. data/lib/BioDSL/pipeline.rb +1 -1
  84. data/lib/BioDSL/seq.rb +240 -231
  85. data/lib/BioDSL/seq/ambiguity.rb +1 -1
  86. data/lib/BioDSL/seq/assemble.rb +1 -1
  87. data/lib/BioDSL/seq/backtrack.rb +93 -76
  88. data/lib/BioDSL/seq/digest.rb +1 -1
  89. data/lib/BioDSL/seq/dynamic.rb +43 -55
  90. data/lib/BioDSL/seq/homopolymer.rb +34 -36
  91. data/lib/BioDSL/seq/kmer.rb +67 -50
  92. data/lib/BioDSL/seq/levenshtein.rb +35 -40
  93. data/lib/BioDSL/seq/translate.rb +64 -55
  94. data/lib/BioDSL/seq/trim.rb +60 -50
  95. data/lib/BioDSL/serializer.rb +1 -1
  96. data/lib/BioDSL/stream.rb +1 -1
  97. data/lib/BioDSL/taxonomy.rb +1 -1
  98. data/lib/BioDSL/test.rb +1 -1
  99. data/lib/BioDSL/tmp_dir.rb +1 -1
  100. data/lib/BioDSL/usearch.rb +1 -1
  101. data/lib/BioDSL/verbose.rb +1 -1
  102. data/lib/BioDSL/version.rb +2 -2
  103. data/test/BioDSL/commands/test_add_key.rb +1 -1
  104. data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
  105. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
  106. data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
  107. data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
  108. data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
  109. data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
  110. data/test/BioDSL/commands/test_classify_seq.rb +1 -1
  111. data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
  112. data/test/BioDSL/commands/test_clip_primer.rb +1 -1
  113. data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
  114. data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
  115. data/test/BioDSL/commands/test_collect_otus.rb +1 -1
  116. data/test/BioDSL/commands/test_complement_seq.rb +1 -1
  117. data/test/BioDSL/commands/test_count.rb +1 -1
  118. data/test/BioDSL/commands/test_count_values.rb +1 -1
  119. data/test/BioDSL/commands/test_degap_seq.rb +1 -1
  120. data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
  121. data/test/BioDSL/commands/test_dump.rb +1 -1
  122. data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
  123. data/test/BioDSL/commands/test_genecall.rb +1 -1
  124. data/test/BioDSL/commands/test_grab.rb +1 -1
  125. data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
  126. data/test/BioDSL/commands/test_mask_seq.rb +1 -1
  127. data/test/BioDSL/commands/test_mean_scores.rb +1 -1
  128. data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
  129. data/test/BioDSL/commands/test_merge_table.rb +1 -1
  130. data/test/BioDSL/commands/test_merge_values.rb +1 -1
  131. data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
  132. data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
  133. data/test/BioDSL/commands/test_plot_matches.rb +1 -1
  134. data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
  135. data/test/BioDSL/commands/test_plot_scores.rb +1 -1
  136. data/test/BioDSL/commands/test_random.rb +1 -1
  137. data/test/BioDSL/commands/test_read_fasta.rb +1 -1
  138. data/test/BioDSL/commands/test_read_fastq.rb +1 -1
  139. data/test/BioDSL/commands/test_read_table.rb +1 -1
  140. data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
  141. data/test/BioDSL/commands/test_slice_align.rb +1 -1
  142. data/test/BioDSL/commands/test_slice_seq.rb +1 -1
  143. data/test/BioDSL/commands/test_sort.rb +1 -1
  144. data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
  145. data/test/BioDSL/commands/test_split_values.rb +1 -1
  146. data/test/BioDSL/commands/test_trim_primer.rb +1 -1
  147. data/test/BioDSL/commands/test_trim_seq.rb +1 -1
  148. data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
  149. data/test/BioDSL/commands/test_uclust.rb +1 -1
  150. data/test/BioDSL/commands/test_unique_values.rb +1 -1
  151. data/test/BioDSL/commands/test_usearch_global.rb +1 -1
  152. data/test/BioDSL/commands/test_usearch_local.rb +1 -1
  153. data/test/BioDSL/commands/test_write_fasta.rb +1 -1
  154. data/test/BioDSL/commands/test_write_fastq.rb +1 -1
  155. data/test/BioDSL/commands/test_write_table.rb +1 -1
  156. data/test/BioDSL/commands/test_write_tree.rb +1 -1
  157. data/test/BioDSL/helpers/test_options_helper.rb +3 -3
  158. data/test/BioDSL/seq/test_assemble.rb +58 -56
  159. data/test/BioDSL/seq/test_backtrack.rb +83 -81
  160. data/test/BioDSL/seq/test_digest.rb +47 -45
  161. data/test/BioDSL/seq/test_dynamic.rb +66 -64
  162. data/test/BioDSL/seq/test_homopolymer.rb +35 -33
  163. data/test/BioDSL/seq/test_kmer.rb +29 -28
  164. data/test/BioDSL/seq/test_translate.rb +44 -42
  165. data/test/BioDSL/seq/test_trim.rb +59 -57
  166. data/test/BioDSL/test_cary.rb +1 -1
  167. data/test/BioDSL/test_command.rb +2 -2
  168. data/test/BioDSL/test_csv.rb +34 -31
  169. data/test/BioDSL/test_debug.rb +31 -31
  170. data/test/BioDSL/test_fasta.rb +30 -29
  171. data/test/BioDSL/test_fastq.rb +27 -26
  172. data/test/BioDSL/test_filesys.rb +28 -27
  173. data/test/BioDSL/test_fork.rb +29 -28
  174. data/test/BioDSL/test_math.rb +31 -30
  175. data/test/BioDSL/test_mummer.rb +1 -1
  176. data/test/BioDSL/test_pipeline.rb +1 -1
  177. data/test/BioDSL/test_seq.rb +42 -41
  178. data/test/BioDSL/test_serializer.rb +35 -33
  179. data/test/BioDSL/test_stream.rb +28 -27
  180. data/test/BioDSL/test_taxonomy.rb +38 -37
  181. data/test/BioDSL/test_test.rb +32 -31
  182. data/test/BioDSL/test_tmp_dir.rb +1 -1
  183. data/test/BioDSL/test_usearch.rb +28 -27
  184. data/test/BioDSL/test_verbose.rb +32 -31
  185. data/test/helper.rb +34 -31
  186. metadata +3 -2
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of the BioDSL framework (www.BioDSL.org). #
24
+ # This software is part of BioDSL (www.BioDSL.org). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -1,36 +1,37 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
27
 
27
28
  module BioDSL
28
29
  # Error class for all exceptions to do with BackTrack.
29
30
  class BackTrackError < StandardError; end
30
31
 
31
- # Module containing code to locate nucleotide patterns in sequences allowing for
32
- # ambiguity codes and a given maximum mismatches, insertions, and deletions. The
33
- # pattern match engine is based on a backtrack algorithm.
32
+ # Module containing code to locate nucleotide patterns in sequences allowing
33
+ # for ambiguity codes and a given maximum mismatches, insertions, and
34
+ # deletions. The pattern match engine is based on a backtrack algorithm.
34
35
  # Insertions are nucleotides found in the pattern but not in the sequence.
35
36
  # Deletions are nucleotides found in the sequence but not in the pattern.
36
37
  # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
@@ -43,7 +44,7 @@ module BioDSL
43
44
  MAX_INS = 5 # Maximum number of insertions allowed
44
45
  MAX_DEL = 5 # Maximum number of deletions allowed
45
46
 
46
- # ------------------------------------------------------------------------------
47
+ # --------------------------------------------------------------------------
47
48
  # str.patmatch(pattern[, options])
48
49
  # -> Match
49
50
  # str.patmatch(pattern[, options]) { |match|
@@ -58,20 +59,20 @@ module BioDSL
58
59
  # :max_insertions
59
60
  # :max_deletions
60
61
  #
61
- # ------------------------------------------------------------------------------
62
- # Method to iterate through a sequence from a given start position to the end of
63
- # the sequence or to a given stop position to locate a pattern allowing for a
64
- # maximum number of mismatches, insertions, and deletions. Insertions are
65
- # nucleotides found in the pattern but not in the sequence. Deletions are
66
- # nucleotides found in the sequence but not in the pattern.
62
+ # --------------------------------------------------------------------------
63
+ # Method to iterate through a sequence from a given start position to the
64
+ # end of the sequence or to a given stop position to locate a pattern
65
+ # allowing for a maximum number of mismatches, insertions, and deletions.
66
+ # Insertions are nucleotides found in the pattern but not in the sequence.
67
+ # Deletions are nucleotides found in the sequence but not in the pattern.
67
68
  def patmatch(pattern, options = {})
68
- options[:start] ||= 0
69
- options[:stop] ||= self.length - 1
69
+ options[:start] ||= 0
70
+ options[:stop] ||= length - 1
70
71
  options[:max_mismatches] ||= 0
71
72
  options[:max_insertions] ||= 0
72
- options[:max_deletions] ||= 0
73
+ options[:max_deletions] ||= 0
73
74
 
74
- self.patscan(pattern, options) do |m|
75
+ patscan(pattern, options) do |m|
75
76
  if block_given?
76
77
  yield m
77
78
  else
@@ -80,7 +81,7 @@ module BioDSL
80
81
  end
81
82
  end
82
83
 
83
- # ------------------------------------------------------------------------------
84
+ # --------------------------------------------------------------------------
84
85
  # str.patscan(pattern[, options])
85
86
  # -> Array
86
87
  # str.patscan(pattern[, options]) { |match|
@@ -95,39 +96,57 @@ module BioDSL
95
96
  # :max_insertions
96
97
  # :max_deletions
97
98
  #
98
- # ------------------------------------------------------------------------------
99
- # Method to iterate through a sequence from a given start position to the end of
100
- # the sequence or to a given stop position to locate a pattern allowing for a
101
- # maximum number of mismatches, insertions, and deletions. Insertions are
102
- # nucleotides found in the pattern but not in the sequence. Deletions are
103
- # nucleotides found in the sequence but not in the pattern. Matches found in
104
- # block context return the Match object. Otherwise matches are returned in an
105
- # Array of Match objects.
99
+ # --------------------------------------------------------------------------
100
+ # Method to iterate through a sequence from a given start position to the
101
+ # end of the sequence or to a given stop position to locate a pattern
102
+ # allowing for a maximum number of mismatches, insertions, and deletions.
103
+ # Insertions are nucleotides found in the pattern but not in the sequence.
104
+ # Deletions are nucleotides found in the sequence but not in the pattern.
105
+ # Matches found in block context return the Match object. Otherwise matches
106
+ # are returned in an Array of Match objects.
106
107
  def patscan(pattern, options = {})
107
- options[:start] ||= 0
108
- options[:stop] ||= self.length - 1
108
+ options[:start] ||= 0
109
+ options[:stop] ||= length - 1
109
110
  options[:max_mismatches] ||= 0
110
111
  options[:max_insertions] ||= 0
111
- options[:max_deletions] ||= 0
112
+ options[:max_deletions] ||= 0
112
113
 
113
- raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
114
- raise BackTrackError, "start: #{options[:start]} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? options[:start]
115
- raise BackTrackError, "stop: #{options[:stop]} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? options[:stop]
116
- raise BackTrackError, "max_mismatches: #{options[:max_mismatches]} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? options[:max_mismatches]
117
- raise BackTrackError, "max_insertions: #{options[:max_insertions]} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? options[:max_insertions]
118
- raise BackTrackError, "max_deletions: #{options[:max_deletions]} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? options[:max_deletions]
114
+ unless pattern.downcase =~ OK_PATTERN
115
+ fail BackTrackError, "Bad pattern: #{pattern}"
116
+ end
117
+
118
+ unless (0...length).include? options[:start]
119
+ fail BackTrackError, "start: #{options[:start]} out of range " \
120
+ "(0..#{length - 1})"
121
+ end
122
+
123
+ unless (0...length).include? options[:stop]
124
+ fail BackTrackError, "stop: #{options[:stop]} out of range " \
125
+ "(0..#{length - 1})"
126
+ end
127
+
128
+ unless (0..MAX_MIS).include? options[:max_mismatches]
129
+ fail BackTrackError, "max_mismatches: #{options[:max_mismatches]} " \
130
+ "out of range (0..#{MAX_MIS})"
131
+ end
132
+
133
+ unless (0..MAX_INS).include? options[:max_insertions]
134
+ fail BackTrackError, "max_insertions: #{options[:max_insertions]} " \
135
+ "out of range (0..#{MAX_INS})"
136
+ end
137
+
138
+ unless (0..MAX_DEL).include? options[:max_deletions]
139
+ fail BackTrackError, "max_deletions: #{options[:max_deletions]} " \
140
+ "out of range (0..#{MAX_DEL})"
141
+ end
119
142
 
120
143
  matches = []
121
144
 
122
- while result = scan_C(self.seq,
123
- pattern,
124
- options[:start],
125
- options[:stop],
126
- options[:max_mismatches],
127
- options[:max_insertions],
128
- options[:max_deletions]
129
- )
130
- match = Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
145
+ while (result = scan_C(@seq, pattern, options[:start], options[:stop],
146
+ options[:max_mismatches], options[:max_insertions],
147
+ options[:max_deletions]))
148
+ match = Match.new(result.first, result.last,
149
+ @seq[result.first...result.first + result.last])
131
150
 
132
151
  if block_given?
133
152
  yield match
@@ -146,10 +165,11 @@ module BioDSL
146
165
  inline do |builder|
147
166
  add_ambiguity_macro(builder)
148
167
 
149
- # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
150
- # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
151
- # reporting the match endpoints. State is used to avoid ins followed by del and visa versa which
152
- # are nonsense.
168
+ # Backtrack algorithm for matching a pattern (p) starting in a sequence
169
+ # (s) allowing for mis mismatches, ins insertions and del deletions. ss is
170
+ # the start of the sequence, used only for reporting the match endpoints.
171
+ # State is used to avoid ins followed by del and vice versa which are
172
+ # nonsense.
153
173
  builder.prefix %{
154
174
  unsigned int backtrack(
155
175
  char *ss, // Sequence start
@@ -177,9 +197,9 @@ module BioDSL
177
197
  return 0;
178
198
  }
179
199
  }
180
-
181
- # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins
182
- # insertions and del deletions.
200
+
201
+ # Find pattern (p) in a sequence (s) starting at pos, with at most mis
202
+ # mismatches, ins insertions and del deletions.
183
203
  builder.c %{
184
204
  VALUE scan_C(
185
205
  VALUE _s, // Sequence
@@ -247,6 +267,3 @@ module BioDSL
247
267
  end
248
268
  end
249
269
  end
250
-
251
-
252
- __END__
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -1,35 +1,36 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
27
 
27
28
  module BioDSL
28
29
  # Error class for Dynamic.
29
30
  class DynamicError < StandardError; end
30
31
 
31
- # Module containing code to locate nucleotide patterns in sequences allowing for
32
- # ambiguity codes and a given maximum edit distance.
32
+ # Module containing code to locate nucleotide patterns in sequences allowing
33
+ # for ambiguity codes and a given maximum edit distance.
33
34
  # Insertions are nucleotides found in the pattern but not in the sequence.
34
35
  # Deletions are nucleotides found in the sequence but not in the pattern.
35
36
  #
@@ -38,7 +39,7 @@ module BioDSL
38
39
  module Dynamic
39
40
  extend BioDSL::Ambiguity
40
41
 
41
- # ------------------------------------------------------------------------------
42
+ # --------------------------------------------------------------------------
42
43
  # str.patmatch(pattern[, pos[, max_edit_distance]])
43
44
  # -> Match or nil
44
45
  # str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
@@ -46,16 +47,16 @@ module BioDSL
46
47
  # }
47
48
  # -> Match
48
49
  #
49
- # ------------------------------------------------------------------------------
50
+ # --------------------------------------------------------------------------
50
51
  # Method to iterate through a sequence to locate the first pattern match
51
52
  # starting from a given position and allowing for a maximum edit distance.
52
53
  def patmatch(pattern, pos = 0, max_edit_distance = 0)
53
- self.patscan(pattern, pos, max_edit_distance) do |m|
54
+ patscan(pattern, pos, max_edit_distance) do |m|
54
55
  return m
55
56
  end
56
57
  end
57
58
 
58
- # ------------------------------------------------------------------------------
59
+ # --------------------------------------------------------------------------
59
60
  # str.patscan(pattern[, pos[, max_edit_distance]])
60
61
  # -> Array or nil
61
62
  # str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
@@ -63,16 +64,17 @@ module BioDSL
63
64
  # }
64
65
  # -> Match
65
66
  #
66
- # ------------------------------------------------------------------------------
67
- # Method to iterate through a sequence to locate pattern matches starting from a
68
- # given position and allowing for a maximum edit distance. Matches found in
69
- # block context return the Match object. Otherwise matches are returned in an
70
- # Array.
67
+ # --------------------------------------------------------------------------
68
+ # Method to iterate through a sequence to locate pattern matches starting
69
+ # from a given position and allowing for a maximum edit distance. Matches
70
+ # found in block context return the Match object. Otherwise matches are
71
+ # returned in an Array.
71
72
  def patscan(pattern, pos = 0, max_edit_distance = 0)
72
73
  matches = []
73
74
 
74
- while result = match_C(self.seq, self.length, pattern, pattern.length, pos, max_edit_distance)
75
- match = Match.new(*result, self.seq[result[0] ... result[0] + result[1]]);
75
+ while (result = match_C(@seq, length, pattern, pattern.length, pos,
76
+ max_edit_distance))
77
+ match = Match.new(*result, @seq[result[0]...result[0] + result[1]])
76
78
 
77
79
  if block_given?
78
80
  yield match
@@ -97,7 +99,7 @@ module BioDSL
97
99
  }
98
100
 
99
101
  builder.prefix %{
100
- typedef struct
102
+ typedef struct
101
103
  {
102
104
  unsigned int mis;
103
105
  unsigned int ins;
@@ -207,7 +209,7 @@ module BioDSL
207
209
  unsigned int pat_len = FIX2UINT(_pat_len);
208
210
  unsigned int pos = FIX2UINT(_pos);
209
211
  unsigned int max_ed = FIX2UINT(_max_ed);
210
-
212
+
211
213
  score vec[MAX_PAT] = {0};
212
214
  unsigned int vec_len = pat_len + 1;
213
215
  unsigned int match_beg = 0;
@@ -244,20 +246,6 @@ module BioDSL
244
246
  }
245
247
  end
246
248
 
247
- class Match
248
- attr_accessor :beg, :length, :mis, :ins, :del, :match
249
-
250
- def initialize(beg, length, mis, ins, del, match)
251
- @beg = beg
252
- @length = length
253
- @mis = mis
254
- @ins = ins
255
- @del = del
256
- @match = match
257
- end
258
- end
249
+ Match = Struct.new(:beg, :length, :mis, :ins, :del, :match)
259
250
  end
260
251
  end
261
-
262
-
263
- __END__
@@ -1,39 +1,45 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
27
 
28
+ # Namespace for BioDSL.
27
29
  module BioDSL
28
30
  # Error class for all exceptions to do with Homopolymer.
29
31
  class HomopolymerError < StandardError; end
30
32
 
33
+ # Namespace for Homopolymer
31
34
  module Homopolymer
32
35
  def each_homopolymer(min = 1)
33
- raise HomopolymerError, "Bad min value: #{min}" if min <= 0
36
+ fail HomopolymerError, "Bad min value: #{min}" if min <= 0
34
37
  list = []
35
38
 
36
- self.seq.upcase.scan(/A{#{min},}|T{#{min},}|G{#{min},}|C{#{min},}|N{#{min},}/) do |match|
39
+ regex = Regexp.new("A{#{min},}|T{#{min},}|G{#{min},}|C{#{min},}|" \
40
+ "N{#{min},}")
41
+
42
+ @seq.upcase.scan(regex) do |match|
37
43
  hp = Homopolymer.new(match, match.length, $`.length)
38
44
 
39
45
  if block_given?
@@ -46,14 +52,6 @@ module BioDSL
46
52
  block_given? ? self : list
47
53
  end
48
54
 
49
- class Homopolymer
50
- attr_reader :pattern, :length, :pos
51
-
52
- def initialize(pattern, length, pos)
53
- @pattern = pattern
54
- @length = length
55
- @pos = pos
56
- end
57
- end
55
+ Homopolymer = Struct.new(:pattern, :length, :pos)
58
56
  end
59
57
  end