BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,200 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
module BioDSL
  # Class for creating HTML reports from an executed BioDSL pipeline.
  #
  # Each section of the report is a HAML template located in the directory
  # returned by #root_dir. Every template is rendered with the HtmlReport
  # instance as scope, so templates can call the private helpers below.
  class HtmlReport
    require 'tilt/haml'
    require 'base64'
    require 'BioDSL/helpers/options_helper'

    include OptionsHelper

    # Constructor for HtmlReport.
    #
    # @param pipeline [BioDSL::Pipeline] Pipeline object
    def initialize(pipeline)
      @pipeline = pipeline
      @commands = pipeline.commands
    end

    # Render HTML output.
    #
    # @return [String] Rendered layout template.
    def to_html
      render('layout.html.haml', self, pipeline: @pipeline.to_s,
                                       commands: @commands)
    end

    private

    # Render HTML templates.
    #
    # @param template [String] Template file name relative to #root_dir.
    # @param scope    [Object] Object the template is evaluated against.
    # @param args     [Hash]   Local variables handed to the template.
    #
    # @return [String] Rendered template.
    def render(template, scope, args = {})
      Tilt.new(File.join(root_dir, template)).render(scope, args)
    end

    # Render HTML CSS section.
    def render_css
      render('css.html.haml', self)
    end

    # Render HTML pipeline section
    #
    # @param pipeline [String] String from BioDSL::Pipeline#to_s
    def render_pipeline(pipeline)
      # Split the one-line pipeline string into its chained calls and put each
      # call on a line of its own; the first inserted newline is removed again.
      calls    = pipeline.scan(/[^.]+\(.*?\)|[^.(]+/)
      pipeline = calls.join(".\n").sub(/\n/, '')

      render('pipeline.html.haml', self, pipeline: pipeline)
    end

    # Render HTML overview section.
    #
    # @param commands [Array] List of commands from a pipeline.
    def render_overview(commands)
      render('overview.html.haml', self, commands: commands)
    end

    # Render HTML command section.
    #
    # @param command [BioDSL::Command] Command object.
    # @param index   [Object] Passed through to the template as +index+.
    def render_command(command, index)
      render('command.html.haml', self, command: command, index: index)
    end

    # Render HTML status section.
    #
    # @param command [BioDSL::Command] Command object.
    def render_status(command)
      # Entries whose key starts with "time" are left out of this section.
      stats = command.status.reject { |key, _| key.to_s.start_with?('time') }

      # NOTE(review): the local is spelled +statsus+; the template is not
      # visible here, so the key is kept as is - confirm against
      # status.html.haml before renaming.
      render('status.html.haml', self, exit_status: command.run_status,
                                       statsus: stats)
    end

    # Render HTML time section.
    #
    # @param status [BioDSL::Status] Status object.
    def render_time(status)
      render('time.html.haml', self, status: status)
    end

    # Render HTML input files section.
    #
    # @param options [Hash] Command options hash.
    def render_input_files(options)
      # options_glob is not defined in this class; presumably it comes from
      # the included OptionsHelper.
      render('input_files.html.haml', self,
             files: options_glob(options[:input]))
    end

    # Render HTML output file section.
    #
    # @param options [Hash] Command options hash.
    def render_output_files(options)
      render('output_files.html.haml', self, options: options)
    end

    # Render PNG data.
    #
    # The file at options[:output] is embedded as a base64 data URI.
    #
    # @param options [Hash] Command options hash.
    def render_png(options)
      path = options[:output]

      # Read in binary mode: a PNG is not text, and text mode may translate
      # line endings on some platforms and corrupt the image.
      encoded  = Base64.encode64(File.binread(path))
      png_data = "data:image/png;base64,#{encoded}"

      render('png.html.haml', self, path: path, png_data: png_data)
    end

    # Detect if any input options are set.
    #
    # @param options [Hash] Options hash.
    # @option options [String] :input File glob expression.
    #
    # @return [Boolean]
    def input?(options)
      options[:input] ? true : false
    end

    # Detect if any output options are set.
    #
    # @param options [Hash] Options hash.
    # @option options [String] :output Path to output file.
    #
    # @return [Boolean]
    def output?(options)
      options[:output] ? true : false
    end

    # Detect if any PNG file is available.
    #
    # @param options [Hash] Options hash.
    # @option options [String] :output   Path to output file.
    # @option options [Symbol] :terminal Plot type.
    #
    # @return [Boolean]
    def png?(options)
      return false unless options[:output] && options[:terminal] == :png

      File.exist?(options[:output])
    end

    # Return the path of the HTML root dir.
    #
    # @return [String] Root dir.
    def root_dir
      File.join(File.dirname(__FILE__), '..', '..', 'www')
    end

    # Return the help URL for a given command.
    #
    # @param command [Symbol] Command name.
    #
    # @return [String] HTML link.
    def help_url(command)
      # snake_case command name -> CamelCase class name.
      camel = command.to_s.split('_').map(&:capitalize).join

      'http://www.rubydoc.info/gems/BioDSL/' \
      "#{BioDSL::VERSION}/BioDSL/#{camel}"
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
29
module BioDSL
  # Adding methods to Math module.
  module Math
    # Class method to calculate the distance from a point to the infinite line
    # running through two given points. All values are given as coordinates.
    #
    # The cross-product form is used instead of slope/intercept so that
    # vertical lines (x1 == x2) do not divide by zero.
    #
    # @param px [Numeric] point x coordinate
    # @param py [Numeric] point y coordinate
    # @param x1 [Numeric] line point 1 x coordinate
    # @param y1 [Numeric] line point 1 y coordinate
    # @param x2 [Numeric] line point 2 x coordinate
    # @param y2 [Numeric] line point 2 y coordinate
    #
    # @return [Float] Distance from the point to the line. When both line
    #   points coincide no line is defined and the distance to that single
    #   point is returned.
    def self.dist_point2line(px, py, x1, y1, x2, y2)
      dx     = (x2 - x1).to_f
      dy     = (y2 - y1).to_f
      length = ::Math.hypot(dx, dy)

      return dist_point2point(px, py, x1, y1) if length.zero?

      (dy * (px - x1) - dx * (py - y1)).abs / length
    end

    # Class method to calculate the distance between two points given
    # as pairs of coordinates.
    #
    # @param x1 [Numeric] point 1 x coordinate
    # @param y1 [Numeric] point 1 y coordinate
    # @param x2 [Numeric] point 2 x coordinate
    # @param y2 [Numeric] point 2 y coordinate
    #
    # @return [Float] Euclidean distance.
    def self.dist_point2point(x1, y1, x2, y2)
      ::Math.sqrt((x2.to_f - x1.to_f)**2 + (y2.to_f - y1.to_f)**2)
    end
  end
end
@@ -0,0 +1,216 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
3
+ # #
4
+ # This program is free software; you can redistribute it and/or #
5
+ # modify it under the terms of the GNU General Public License #
6
+ # as published by the Free Software Foundation; either version 2 #
7
+ # of the License, or (at your option) any later version. #
8
+ # #
9
+ # This program is distributed in the hope that it will be useful, #
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
12
+ # GNU General Public License for more details. #
13
+ # #
14
+ # You should have received a copy of the GNU General Public License #
15
+ # along with this program; if not, write to the Free Software #
16
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
17
+ # USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
+ # Namespace for BioDSL.
28
module BioDSL
  # Error class for Mummer errors.
  MummerError = Class.new(StandardError)

  # rubocop: disable ClassLength

  # Class for executing MUMmer and parsing MUMmer results.
  class Mummer
    # Values accepted for the :direction option.
    DIRECTIONS = %i[forward reverse both].freeze

    # Locate matches between two sequences.
    #
    # @param seq1    [BioDSL::Seq] Sequence 1.
    # @param seq2    [BioDSL::Seq] Sequence 2.
    # @param options [Hash] Options hash.
    # @option options [Integer] :length_min Minimum match length (default 20).
    # @option options [Symbol]  :direction  :forward, :reverse or :both
    #   (default :both).
    #
    # @yield  [Mummer::Match] A match object
    # @return [Enumerable] An Enumerable
    def self.each_mem(seq1, seq2, options = {})
      mummer = new(seq1, seq2, options)

      if block_given?
        mummer.each_mem { |mem| yield mem }
      else
        mummer.each_mem
      end
    end

    # Constructor for Mummer class.
    #
    # @param seq1    [BioDSL::Seq] Sequence 1.
    # @param seq2    [BioDSL::Seq] Sequence 2.
    # @param options [Hash] Options hash.
    #
    # @raise [BioDSL::MummerError] on bad options.
    #
    # @return [Mummer] Class instance.
    def initialize(seq1, seq2, options = {})
      @seq1    = seq1
      @seq2    = seq2
      @options = options.dup # defaults must not leak into the caller's hash
      @command = []
      @q_id    = nil
      @dir     = nil

      default_options
      check_options
    end

    # Write both sequences to temporary FASTA files, run mummer on them and
    # yield every match parsed from its output.
    #
    # @yield  [Mummer::Match] A match object
    # @return [Enumerable] An Enumerable
    def each_mem
      return to_enum :each_mem unless block_given?

      TmpDir.create('in1', 'in2', 'out') do |file_in1, file_in2, file_out|
        BioDSL::Fasta.open(file_in1, 'w') { |io| io.puts @seq1.to_fasta }
        BioDSL::Fasta.open(file_in2, 'w') { |io| io.puts @seq2.to_fasta }

        execute(file_in1, file_in2, file_out)

        File.open(file_out) do |io|
          while (match = get_match(io))
            yield match
          end
        end
      end
    end

    private

    # Get a match if possible.
    #
    # Reading continues from the current position of the stream. Header lines
    # update the remembered query id and direction, which are then attached to
    # the match lines that follow.
    #
    # @param io [IO] IO stream.
    #
    # @return [Match, nil] The next match, or nil when the stream is exhausted.
    def get_match(io)
      io.each do |raw|
        case raw.chomp
        when /^> (\S+)\s+Reverse\s+Len = \d+$/
          @q_id = Regexp.last_match(1)
          @dir  = 'reverse'
        when /^> (\S+)\s+Len = \d+$/
          @q_id = Regexp.last_match(1)
          @dir  = 'forward'
        when /^\s*(.\S+)\s+(\d+)\s+(\d+)\s+(\d+)$/
          s_id, s_beg, q_beg, hit_len = Regexp.last_match.captures

          # Reported positions are decremented by one to make them 0-based.
          return Match.new(@q_id, s_id, @dir, s_beg.to_i - 1, q_beg.to_i - 1,
                           hit_len.to_i)
        end
      end

      nil
    end

    # Check that the options are OK
    #
    # The type is checked before the value so that a non-Integer length_min is
    # reported as a MummerError instead of failing inside the comparison.
    def check_options
      check_length_min_type
      check_length_min_value
      check_direction
    end

    # Check that the value of :length_min is OK.
    #
    # @raise [BioDSL::MummerError] on bad length_min value.
    def check_length_min_value
      return if @options[:length_min] > 0

      raise MummerError, "Bad length_min: #{@options[:length_min]}"
    end

    # Check that the type of :length_min is OK.
    #
    # @raise [BioDSL::MummerError] on bad length_min type.
    def check_length_min_type
      # Integer instead of Fixnum: Fixnum no longer exists in current Ruby.
      return if @options[:length_min].is_a?(Integer)

      raise MummerError, "Bad length_min type: #{@options[:length_min].class}"
    end

    # Check that the value of :direction is OK.
    #
    # @raise [BioDSL::MummerError] on bad direction.
    def check_direction
      return if DIRECTIONS.include?(@options[:direction])

      raise MummerError, "Bad direction: #{@options[:direction]}"
    end

    # Set some sensible default options.
    def default_options
      @options[:length_min] ||= 20
      @options[:direction]  ||= :both
    end

    # Execute MUMmer.
    #
    # @param file_in1 [String] Path to sequence file 1.
    # @param file_in2 [String] Path to sequence file 2.
    # @param file_out [String] Path to output file.
    #
    # @raise [RuntimeError] if the command does not exit successfully.
    def execute(file_in1, file_in2, file_out)
      cmd = compile_command(file_in1, file_in2, file_out)

      $stderr.puts "Running command: #{cmd}" if BioDSL.verbose

      # Kernel#system returns true only on a zero exit status, so no
      # dependency on the English library ($CHILD_STATUS) is needed.
      return if system(cmd)

      raise "Error running command: #{cmd}"
    end

    # Compile a command for execution of mummer.
    #
    # The command is rebuilt from scratch on every call, so running the same
    # instance more than once does not accumulate arguments.
    #
    # @param file_in1 [String] Path to sequence file 1.
    # @param file_in2 [String] Path to sequence file 2.
    # @param file_out [String] Path to output file.
    #
    # @return [String] Command string.
    def compile_command(file_in1, file_in2, file_out)
      @command = [
        'mummer',
        '-c', # report position of revcomp match relative to query seq.
        '-L', # show length of query seq in header.
        '-F', # force 4-column output.
        "-l #{@options[:length_min]}",
        '-n'  # nucleotides only [atcg].
      ]

      case @options[:direction]
      when :reverse then @command << '-r' # only compute reverse matches.
      when :both    then @command << '-b' # compute forward and reverse matches.
      end

      @command << file_in1
      @command << file_in2
      @command << "> #{file_out}"
      # Placed after the stdout redirect, this sends stderr to the output file
      # as well instead of the terminal.
      @command << '2>&1' unless BioDSL.verbose

      @command.join(' ')
    end

    # A single match: query id, subject id, direction ('forward'/'reverse'),
    # subject and query start positions and the length of the hit.
    Match = Struct.new(:q_id, :s_id, :dir, :s_beg, :q_beg, :hit_len) do
      # @return [Integer] Position of the last matching query residue.
      def q_end
        q_beg + hit_len - 1
      end

      # @return [Integer] Position of the last matching subject residue.
      def s_end
        s_beg + hit_len - 1
      end
    end
  end
end