BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,354 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
3
+ # #
4
+ # This program is free software; you can redistribute it and/or #
5
+ # modify it under the terms of the GNU General Public License #
6
+ # as published by the Free Software Foundation; either version 2 #
7
+ # of the License, or (at your option) any later version. #
8
+ # #
9
+ # This program is distributed in the hope that it will be useful, #
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
12
+ # GNU General Public License for more details. #
13
+ # #
14
+ # You should have received a copy of the GNU General Public License #
15
+ # along with this program; if not, write to the Free Software #
16
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
17
+ # USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+ module BioDSL
27
# Raise a RuntimeError when ctrl-c (SIGINT) is received.
Signal.trap('INT') do
  raise 'Interrupted: ctrl-c pressed'
end

# Error class for Pipeline errors.
class PipelineError < StandardError; end
31
+
32
+ # rubocop: disable ClassLength
33
+
34
+ # Pipeline class
35
+ class Pipeline
36
+ require 'BioDSL/command'
37
+ require 'BioDSL/helpers/email_helper'
38
+ require 'BioDSL/helpers/history_helper'
39
+ require 'BioDSL/helpers/log_helper'
40
+ require 'BioDSL/helpers/options_helper'
41
+ require 'BioDSL/helpers/status_helper'
42
+ require 'mail'
43
+ require 'yaml'
44
+
45
+ include EmailHelper
46
+ include LogHelper
47
+ include HistoryHelper
48
+ include OptionsHelper
49
+ include StatusHelper
50
+
51
+ attr_accessor :commands, :complete
52
+
53
# Create an empty Pipeline that has not been run yet.
def initialize
  @complete = false # Set to true once the commands have been run.
  @enums    = [[]]  # Enumerators; the initial empty Array is the first input.
  @options  = {}    # Run options, assigned when the pipeline is run.
  @commands = []    # Commands added to the pipeline, in order.
end
60
+
61
# @return [Integer] Number of commands currently in the pipeline.
def size
  @commands.length
end
65
+
66
# Append all commands of another Pipeline to this Pipeline.
#
# The status list of a Pipeline is built from its commands on every call to
# #status, so the status entries follow the commands and need no copying of
# their own.
#
# @param other [Pipeline] Pipeline whose commands are appended.
#
# @return [self]
def <<(other)
  # Array#concat handles `pipeline << pipeline` safely; appending to the
  # Array while iterating over that same Array would never finish.
  @commands.concat(other.commands)

  self
end
77
+
78
# Build a new Pipeline holding the commands of this Pipeline followed by the
# commands of +other+.
#
# @param other [Pipeline] Pipeline to add.
#
# @raise [PipelineError] If +other+ is not a BioDSL::Pipeline.
#
# @return [Pipeline] The new, combined Pipeline.
def +(other)
  is_pipeline = other.is_a?(BioDSL::Pipeline)

  fail PipelineError, "Not a pipeline: #{other.inspect}" unless is_pipeline

  combined = self.class.new
  combined << self
  combined << other # `<<` returns its receiver, i.e. the combined Pipeline.
end
88
+
89
# Remove the last command from this Pipeline and return it wrapped in a new
# Pipeline.
#
# @return [Pipeline] Pipeline holding the removed command, or holding no
#   commands at all if this Pipeline was already empty.
def pop
  popped = BioDSL::Pipeline.new
  # Array#pop returns nil on an empty Array; compact keeps that nil from
  # ending up as a bogus command in the new Pipeline.
  popped.commands = [@commands.pop].compact
  popped
end
96
+
97
# Run all the commands in the Pipeline, then print status, send email, save
# the report and log success. The history is saved whether or not the run
# succeeded.
#
# Any StandardError raised on the way, including the PipelineError for an
# empty pipeline, is handed to exit_gracefully, which either re-raises it or
# terminates the process.
#
# @param options [Hash]
# @option options [Boolean] :verbose (false) Enable verbose output.
#
# @raise [PipelineError] If no commands are added to the pipeline.
#
# @return [self]
def run(options = {})
  begin
    prime_variables(options)

    raise BioDSL::PipelineError, 'Empty pipeline' if @commands.empty?

    @options = options

    check_options
    command_runner
    print_status
    send_email(self)
    save_report
    log_ok

    self
  rescue => error
    exit_gracefully(error)
  ensure
    save_history
  end
end
125
+
126
# Collect the status hash of every command, in pipeline order. Once the run
# is complete, elapsed time and delta are recalculated on each command before
# its status is read.
#
# @return [Array] List of status hashes.
def status
  @commands.map do |command|
    if @complete
      command.calc_time_elapsed
      command.calc_delta
    end

    command.status
  end
end
139
+
140
+ # Format a Pipeline to a pretty string which is returned.
141
+ def to_s
142
+ command_strings = %w(BP new)
143
+
144
+ @commands.each { |command| command_strings << command.to_s }
145
+
146
+ if @complete
147
+ if @options.empty?
148
+ command_strings << 'run'
149
+ else
150
+ command_strings << "run(#{options_string})"
151
+ end
152
+ end
153
+
154
+ command_strings.join('.')
155
+ end
156
+
157
+ private
158
+
159
# Add a command to the pipeline. This is done by first requiring the file
# named after the method and then instantiating the class whose name is the
# CamelCase form of the method name (`read_fasta` -> `BioDSL::ReadFasta`).
#
# The first argument, if any, is used as the options for the command and is
# passed through options_load_rc before the command class is instantiated.
# OptionsHelper and StatusHelper are mixed into the command class.
#
# @param method [Symbol] Method name.
# @param args [Array] Method arguments.
# @param block [Proc] Method block.
#
# @example Here we add the command `dump` to the pipeline.
#   Pipeline.new.dump
#     # => self
#
# @raise [LoadError] If no file matching the method name can be required.
# @raise [NoMethodError] If the file loads but defines no matching class.
#
# @return [self]
def method_missing(method, *args, &block)
  require_file(method)

  const = method.to_s.split('_').map(&:capitalize).join('')

  return super unless BioDSL.const_defined?(const)

  options = args.first || {}
  options_load_rc(options, method)

  klass = BioDSL.const_get(const)
  klass.send(:include, OptionsHelper)
  klass.send(:include, StatusHelper)

  @commands << Command.new(method, klass.new(options).lmb, options)

  self
end

# Companion to method_missing so that `respond_to?` and `method` agree with
# the commands that can be added dynamically. A method counts as available
# when a matching command file is found on the load path. The file is only
# looked up here, never loaded.
#
# @param method [Symbol] Method name.
# @param include_private [Boolean] Passed on to the default implementation.
#
# @return [Boolean]
def respond_to_missing?(method, include_private = false)
  path = File.join('BioDSL', 'commands', "#{method}.rb")

  $LOAD_PATH.any? { |dir| File.file?(File.join(dir.to_s, path)) } || super
end
192
+
193
# Require a file from the lib/BioDSL/commands directory given a method name
# that must match the file name. E.g. `require_file(:dump)` requires the file
# `BioDSL/commands/dump.rb`.
#
# Nothing is required if the command class is already defined. The class
# name is derived the same way as in method_missing (`read_fasta` ->
# `ReadFasta`), so the check also works for multi-word commands.
#
# @param method [Symbol]
#   The name of the method.
#
# @raise [LoadError] If no such file can be required.
def require_file(method)
  const = method.to_s.split('_').map(&:capitalize).join('')

  return if BioDSL.const_defined?(const)

  # FIXME: an explicit existence check raising Errno::ENOENT with the file
  # name was planned here; for now a missing file surfaces as LoadError.
  require File.join('BioDSL', 'commands', method.to_s)
end
210
+
211
# Print name, options and status of every command as YAML to stdout. Does
# nothing unless the :verbose run option is set.
def print_status
  return unless @options[:verbose]

  @commands.each do |command|
    entry = {
      command: command.name,
      options: command.options,
      status:  command.status
    }

    puts entry.to_yaml
  end
end
223
+
224
# Validate the run options using the checks from OptionsHelper: only known
# keys are accepted, :debug and :verbose are restricted to true, false or
# nil, :progress and :verbose are checked for conflict, :subject is tied to
# :email, and the :report file is checked against :force.
def check_options
  allowed_keys = [:debug, :verbose, :email, :progress, :subject, :input,
                  :output, :output_dir, :report, :force]

  options_allowed(@options, *allowed_keys)
  options_allowed_values(@options, debug: [true, false, nil])
  options_allowed_values(@options, verbose: [true, false, nil])
  options_conflict(@options, progress: :verbose)
  options_tie(@options, subject: :email)
  options_files_exist_force(@options, :report)
end
234
+
235
# Run all commands in the Pipeline. The order of the steps matters: output
# paths are prefixed and start times set before the enumerator chain is
# built, and the chain is iterated last.
def run_commands
  prefix_output_dir     # Prefix command output files with :output_dir.
  run_time_start        # Stamp all commands with a common start time.
  run_add_enumerators   # Chain the commands together as Enumerators.
  run_enumerate         # Pull all records through the chain.
end
242
+
243
# Stamp the status of every command with one and the same start time.
def run_time_start
  started_at = Time.now

  @commands.each { |command| command.status[:time_start] = started_at }
end
251
+
252
# Chain the commands together by adding one Enumerator per command to the
# instance array. Each command reads from the Enumerator of the previous
# command; the first command reads from the :input run option if given and
# otherwise from the initial, empty source.
#
# Only the first command may read from :input. If every command read from
# it, the chain would be broken and only the last command would ever run.
def run_add_enumerators
  @commands.each_with_index do |command, index|
    input = (index.zero? && @options[:input]) || @enums.last
    @enums << Enumerator.new { |output| command.call(input, output) }
  end
end
259
+
260
# Pull all records through the last Enumerator, which in turn drives every
# command before it. With the :output run option set, each record is written
# to that object and it is closed afterwards; otherwise the records are
# discarded.
def run_enumerate
  sink = @options[:output]

  return @enums.last.each {} unless sink

  @enums.last.each { |record| sink.write record }
  sink.close # TODO: this close is ugly here
end
269
+
270
# Create the directory given by the :output_dir run option, unless it
# exists, and prefix the :output option of every command that has one with
# this directory. Does nothing without :output_dir.
def prefix_output_dir
  dir = @options[:output_dir]

  return unless dir

  FileUtils.mkdir_p(dir) unless File.exist?(dir)

  @commands.each do |command|
    path = command.options[:output]

    command.options[:output] = File.join(dir, path) if path
  end
end
285
+
286
# Write an HTML status report for this Pipeline to the file given by the
# :report run option, placed inside :output_dir when that option is set.
# Does nothing without :report.
def save_report
  report = @options[:report]

  return unless report

  dir  = @options[:output_dir]
  path = dir ? File.join(dir, report) : report

  File.open(path, 'w') do |ios|
    ios.puts BioDSL::HtmlReport.new(self).to_html
  end
end
300
+
301
# Run all commands once and flag the Pipeline as complete. With the
# :progress run option the commands are run inside status_progress. A
# Pipeline that is already complete is not run again.
def command_runner
  return if @complete

  show_progress = @options[:progress]

  run_commands unless show_progress
  status_progress(@commands) { run_commands } if show_progress

  @complete = true
end
313
+
314
# Set the module-wide BioDSL flags for this run. The test flag is taken from
# the BP_TEST environment variable, the others from the run options.
#
# @param options [Hash] Options hash.
# @option options [Boolean] :debug   Debug flag.
# @option options [Boolean] :verbose Verbose flag.
def prime_variables(options)
  BioDSL.test    = ENV['BP_TEST']
  BioDSL.debug   = options[:debug]
  BioDSL.verbose = options[:verbose]
end
324
+
325
# Output exception message and, in verbose mode, the stack trace to STDERR,
# log the error and exit with status 2. When the BioDSL test flag is set the
# exception is re-raised instead, so that it can be caught by the caller.
#
# @param exception [Exception] The exception that aborted the run.
def exit_gracefully(exception)
  raise exception if BioDSL.test

  STDERR.puts "Error in run: #{exception.message}"
  STDERR.puts exception.backtrace if BioDSL.verbose
  log_error(exception)
  exit 2
end
335
+
336
# Generate a comma separated string from the run options, written so that
# the result can be pasted back as Ruby code: Strings are quoted and escaped,
# Symbols keep their leading colon and nil is spelled out. All other values
# are formatted with their to_s.
#
# @return [String] The formatted options, e.g. `verbose: true, email: "a@b"`.
def options_string
  pairs = @options.map do |key, value|
    case value
    when String, Symbol, nil then "#{key}: #{value.inspect}"
    else "#{key}: #{value}"
    end
  end

  pairs.join(', ')
end
353
+ end
354
+ end
@@ -0,0 +1,66 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
29
+ module BioDSL
30
+ # Namespace for Ambiguity.
31
+ module Ambiguity
32
# Add the MATCH macro and the lookup table it uses to an Inline::C object.
#
# @param inline_builder [Inline::C] Inline C object.
def add_ambiguity_macro(inline_builder)
  # Macro for matching nucleotides including ambiguity codes. The residues
  # are cast to unsigned char before indexing: a plain char may be signed,
  # and bytes >= 0x80 would then index the bitmap with a negative value.
  inline_builder.prefix %(
    #define MATCH(A,B) ((bitmap[(unsigned char) (A)] & bitmap[(unsigned char) (B)]) != 0)
  )

  # Bitmap for matching nucleotides including ambiguity codes, indexed by
  # byte value. Each entry is a set of four bits: A = 1, T/U = 2, C = 4 and
  # G = 8. An ambiguity code is the bitwise OR of the residues it covers
  # (e.g. N = 15). Upper and lower case letters share the same values, and
  # all other bytes are 0 and therefore match nothing.
  inline_builder.prefix %(
    char bitmap[256] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
      0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
      0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
      0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
  )
end
65
+ end
66
+ end
@@ -0,0 +1,240 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
29
+ module BioDSL
30
# Error class for all Assemble errors.
class AssembleError < StandardError; end
32
+
33
+ # rubocop: disable ClassLength
34
+
35
+ # Class with methods for assembling pair-end reads.
36
+ class Assemble
37
+ require 'inline'
38
+
39
+ extend Ambiguity
40
+
41
# Class method to assemble two Seq objects.
#
# @param entry1 [BioDSL::Seq] First sequence of the pair.
# @param entry2 [BioDSL::Seq] Second sequence of the pair.
# @param options [Hash] Options forwarded to the constructor.
#
# @return [BioDSL::Seq, nil] The assembled entry, or nil if no acceptable
#   overlap was found.
def self.pair(entry1, entry2, options = {})
  new(entry1, entry2, options).match
end
46
+
47
# Method to initialize an Assembly object.
#
# @param entry1 [BioDSL::Seq] First sequence of the pair.
# @param entry2 [BioDSL::Seq] Second sequence of the pair.
# @param options [Hash]
# @option options [Integer] :mismatches_max (0) Maximum mismatches, used as
#   a percentage of the overlap length.
# @option options [Integer] :overlap_min (1) Minimum overlap length.
# @option options [Integer] :overlap_max Maximum overlap length.
#
# @raise [AssembleError] on bad option values.
def initialize(entry1, entry2, options)
  @entry1  = entry1
  @entry2  = entry2
  @overlap = 0
  @offset1 = 0
  @offset2 = 0
  # Work on a copy so that the defaults below are not written into the
  # caller's hash.
  @options = options.dup
  @options[:mismatches_max] ||= 0
  @options[:overlap_min]    ||= 1

  check_options
end
60
+
61
# Check option values are sane: :mismatches_max must not be negative, and
# :overlap_min and, if given, :overlap_max must be positive.
#
# @raise [AssembleError] on bad values.
def check_options
  # The messages are built from adjacent literals. A backslash-newline
  # inside a single literal would embed the indentation of the continuation
  # line in the message.
  if @options[:mismatches_max] < 0
    fail AssembleError, 'mismatches_max must be zero or greater - not: ' \
                        "#{@options[:mismatches_max]}"
  end

  if @options[:overlap_max] && @options[:overlap_max] <= 0
    fail AssembleError, 'overlap_max must be one or greater - not: ' \
                        "#{@options[:overlap_max]}"
  end

  if @options[:overlap_min] <= 0
    fail AssembleError, 'overlap_min must be one or greater - not: ' \
                        "#{@options[:overlap_min]}"
  end
end
80
+
81
# Method to locate overlapping matches between two sequences.
#
# The longest possible overlap is tried first. For as long as entry1 is the
# longer sequence its offset is moved forward with the overlap length kept
# fixed; after that the overlap is shortened one residue at a time until it
# drops below :overlap_min. :mismatches_max is applied as a percentage of
# the current overlap length.
#
# @return [BioDSL::Seq, nil] The assembled entry, with overlap length and
#   mismatch count appended to the sequence name if entry1 has one, or nil
#   if no acceptable overlap was found.
def match
  calc_overlap
  diff = calc_diff

  @offset1 = @entry1.length - @overlap - diff

  until @overlap < @options[:overlap_min]
    allowed    = (@overlap * @options[:mismatches_max] * 0.01).round
    mismatches = match_C(@entry1.seq, @entry2.seq, @offset1, @offset2,
                         @overlap, allowed)

    # match_C signals "no match" with a negative number.
    if mismatches && mismatches >= 0
      merged = entry_left + entry_overlap + entry_right

      if @entry1.seq_name
        merged.seq_name = @entry1.seq_name +
                          ":overlap=#{@overlap}:hamming=#{mismatches}"
      end

      return merged
    end

    if diff > 0
      diff -= 1
    else
      @overlap -= 1
    end

    @offset1 += 1
  end
end
105
+
106
# Calculate the overlap to be matched: the length of the shorter sequence,
# capped by the :overlap_max option if given.
#
# @return [Integer] The overlap, which is also stored in @overlap.
def calc_overlap
  candidates = [@entry1.length, @entry2.length]
  candidates << @options[:overlap_max] if @options[:overlap_max]

  @overlap = candidates.min
end
114
+
115
# Calculate how much longer entry1 is than entry2 and return this.
#
# @return [Integer] Length difference, or 0 if entry1 is not the longer one.
def calc_diff
  [@entry1.length - @entry2.length, 0].max
end
123
+
124
# Method to extract and downcase the left part of an assembled pair, i.e.
# the part of entry1 before the overlap.
#
# @return [BioDSL::Seq] Left part.
def entry_left
  @entry1[0...@offset1].tap { |left| left.seq.downcase! }
end
132
+
133
# Method to extract and downcase the right part of an assembled pair, i.e.
# whatever follows the overlap. This is taken from entry1 if it extends
# beyond the overlap and otherwise from entry2.
#
# @return [BioDSL::Seq] Right part.
def entry_right
  overlap_end = @offset1 + @overlap

  right = if @entry1.length > overlap_end
            @entry1[overlap_end..-1]
          else
            @entry2[@offset2 + @overlap..-1]
          end

  right.seq.downcase!
  right
end
146
+
147
# Method to extract and upcase the overlapping part of an assembled pair.
# If both entries have quality scores the two overlaps are merged with
# merge_overlap; otherwise the overlap is taken from entry1 as is.
#
# @return [BioDSL::Seq] Overlapping part.
def entry_overlap
  overlap1 = @entry1[@offset1...@offset1 + @overlap]

  overlap = if @entry1.qual && @entry2.qual
              merge_overlap(overlap1, @entry2[@offset2...@offset2 + @overlap])
            else
              overlap1
            end

  overlap.seq.upcase!
  overlap
end
163
+
164
# Method to merge sequence and quality scores in an overlap.
# The residue with the highest score at mismatch positions is selected.
# The quality scores of the overlap are the mean of the two sequences.
#
# The merged sequence is built from the downcased residues.
# NOTE(review): on equal scores at a mismatch both residues pass the mask and
# the one with the larger byte value appears to win - confirm.
#
# @param entry_overlap1 [BioDSL::Seq] Overlapping part of the first entry.
# @param entry_overlap2 [BioDSL::Seq] Overlapping part of the second entry.
#
# @return [BioDSL::Seq] Merged entry with seq and qual set.
def merge_overlap(entry_overlap1, entry_overlap2)
  length = entry_overlap1.length

  # One row per entry, one column per position.
  na_seq = NArray.byte(length, 2)
  na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, 'byte')
  na_seq[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, 'byte')

  na_qual = NArray.byte(length, 2)
  na_qual[true, 0] = NArray.to_na(entry_overlap1.qual, 'byte')
  na_qual[true, 1] = NArray.to_na(entry_overlap2.qual, 'byte')

  # Positions where the two residues differ.
  mask_xor = (na_seq[true, 0] ^ na_seq[true, 1]) > 0

  # Scores at mismatch positions only; the mask marks, per position, the
  # row(s) holding the highest of these scores.
  qual_masked = na_qual * mask_xor
  mask_seq    = qual_masked.eq(qual_masked.max(1))

  merged = Seq.new
  merged.seq  = (na_seq * mask_seq).max(1).to_s
  merged.qual = na_qual.mean(1).round.to_type('byte').to_s

  merged
end
185
+
186
inline do |builder|
  add_ambiguity_macro(builder)

  # C method for determining if two strings of equal length match given a
  # maximum allowed mismatches and allowing for IUPAC ambiguity codes.
  # Returns the number of mismatches if the strings match, else -1.
  #
  # The whole overlap is scanned unless the mismatch limit is exceeded, so
  # the number returned is the full mismatch count for the overlap and not
  # just the count seen when enough matches had been found. A zero length
  # overlap is reported as no match.
  builder.c %{
    VALUE match_C(
      VALUE _string1,       // String 1
      VALUE _string2,       // String 2
      VALUE _offset1,       // Offset 1
      VALUE _offset2,       // Offset 2
      VALUE _length,        // String length
      VALUE _max_mismatch   // Maximum mismatches
    )
    {
      char         *string1      = StringValuePtr(_string1);
      char         *string2      = StringValuePtr(_string2);
      unsigned int  offset1      = FIX2UINT(_offset1);
      unsigned int  offset2      = FIX2UINT(_offset2);
      unsigned int  length       = FIX2UINT(_length);
      unsigned int  max_mismatch = FIX2UINT(_max_mismatch);

      unsigned int mismatch = 0;
      unsigned int i        = 0;

      if (length == 0) {
        return INT2NUM(-1);
      }

      for (i = 0; i < length; i++)
      {
        if (!MATCH(string1[i + offset1], string2[i + offset2]))
        {
          mismatch++;

          if (mismatch > max_mismatch) {
            return INT2NUM(-1);
          }
        }
      }

      return UINT2NUM(mismatch);
    }
  }
end
239
+ end
240
+ end