BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
3
|
+
# #
|
|
4
|
+
# This program is free software; you can redistribute it and/or #
|
|
5
|
+
# modify it under the terms of the GNU General Public License #
|
|
6
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
7
|
+
# of the License, or (at your option) any later version. #
|
|
8
|
+
# #
|
|
9
|
+
# This program is distributed in the hope that it will be useful, #
|
|
10
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
11
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
12
|
+
# GNU General Public License for more details. #
|
|
13
|
+
# #
|
|
14
|
+
# You should have received a copy of the GNU General Public License #
|
|
15
|
+
# along with this program; if not, write to the Free Software #
|
|
16
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
17
|
+
# USA. #
|
|
18
|
+
# #
|
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
20
|
+
# #
|
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
22
|
+
# #
|
|
23
|
+
# This software is part of BioDSL (www.github.com/maasha/BioDSL). #
|
|
24
|
+
# #
|
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
|
+
module BioDSL
|
|
27
|
+
trap('INT') { fail 'Interrupted: ctrl-c pressed' }
|
|
28
|
+
|
|
29
|
+
# Error class for Pipeline errors.
|
|
30
|
+
PipelineError = Class.new(StandardError)
|
|
31
|
+
|
|
32
|
+
# rubocop: disable ClassLength
|
|
33
|
+
|
|
34
|
+
# Pipeline class
|
|
35
|
+
class Pipeline
|
|
36
|
+
require 'BioDSL/command'
|
|
37
|
+
require 'BioDSL/helpers/email_helper'
|
|
38
|
+
require 'BioDSL/helpers/history_helper'
|
|
39
|
+
require 'BioDSL/helpers/log_helper'
|
|
40
|
+
require 'BioDSL/helpers/options_helper'
|
|
41
|
+
require 'BioDSL/helpers/status_helper'
|
|
42
|
+
require 'mail'
|
|
43
|
+
require 'yaml'
|
|
44
|
+
|
|
45
|
+
include EmailHelper
|
|
46
|
+
include LogHelper
|
|
47
|
+
include HistoryHelper
|
|
48
|
+
include OptionsHelper
|
|
49
|
+
include StatusHelper
|
|
50
|
+
|
|
51
|
+
attr_accessor :commands, :complete
|
|
52
|
+
|
|
53
|
+
# Pipeline class constructor.
|
|
54
|
+
def initialize
  # A new pipeline holds no commands and has not been run.
  @complete = false # Set to true once the commands have been run.
  @enums    = [[]]  # Enumerator chain, seeded with an empty input.
  @options  = {}    # Run options hash.
  @commands = []    # Commands added to the pipeline.
end
|
|
60
|
+
|
|
61
|
+
# @return [Integer] The size or number of commands in a pipeline.
|
|
62
|
+
def size
  # Delegates to the Array of commands.
  @commands.length
end
|
|
65
|
+
|
|
66
|
+
# Method for merging one pipeline onto another.
|
|
67
|
+
#
|
|
68
|
+
# @param other [Pipeline] Pipeline to merge.
|
|
69
|
+
#
|
|
70
|
+
# @return [self].
|
|
71
|
+
def <<(other)
  # Append every command of +other+ to this pipeline, in order.
  other.commands.each { |cmd| commands << cmd }

  # Push each status hash of +other+ onto the Array returned by #status.
  other.status.each { |stat| self.status << stat }

  self
end
|
|
77
|
+
|
|
78
|
+
# Method that adds two Pipelines and returns a new Pipeline.
|
|
79
|
+
def +(other)
  # Guard: only another Pipeline can be added.
  fail PipelineError, "Not a pipeline: #{other.inspect}" unless
    other.is_a?(BioDSL::Pipeline)

  # Merge both operands into a fresh instance; #<< returns its receiver, so
  # the chained call evaluates to the new Pipeline.
  merged = self.class.new
  merged << self << other
end
|
|
88
|
+
|
|
89
|
+
# Removes last command from a Pipeline and returns a new Pipeline with this
|
|
90
|
+
# command.
|
|
91
|
+
def pop
  popped = BioDSL::Pipeline.new

  # Array#pop(1) returns [] when there is nothing to remove, so popping an
  # empty pipeline yields an empty Pipeline instead of one holding [nil].
  popped.commands = @commands.pop(1)

  popped
end
|
|
96
|
+
|
|
97
|
+
# Run all the commands in the Pipeline.
|
|
98
|
+
#
|
|
99
|
+
# @param options [Hash]
|
|
100
|
+
# @option options [Boolean] :verbose (false) Enable verbose output.
|
|
101
|
+
#
|
|
102
|
+
# @raise [PipelineError] If no commands are added to the pipeline.
|
|
103
|
+
#
|
|
104
|
+
# @return [self]
|
|
105
|
+
def run(options = {})
  # Module-level flags are set first so they are in effect for error handling.
  prime_variables(options)

  fail BioDSL::PipelineError, 'Empty pipeline' if @commands.empty?

  @options = options

  check_options  # Validate the run options.
  command_runner # Execute the commands (skipped if already complete).
  print_status
  send_email(self)
  save_report
  log_ok

  self
rescue StandardError => error
  exit_gracefully(error)
ensure
  # History is saved for successful and failed runs alike.
  save_history
end
|
|
125
|
+
|
|
126
|
+
# Return a list of all status hashes from the commands.
|
|
127
|
+
#
|
|
128
|
+
# @return [Array] List of status hashes.
|
|
129
|
+
def status
  @commands.map do |command|
    # The derived figures (calc_time_elapsed / calc_delta) are only refreshed
    # once the pipeline has been run to completion.
    if @complete
      command.calc_time_elapsed
      command.calc_delta
    end

    command.status
  end
end
|
|
139
|
+
|
|
140
|
+
# Format a Pipeline to a pretty string which is returned.
|
|
141
|
+
def to_s
|
|
142
|
+
command_strings = %w(BP new)
|
|
143
|
+
|
|
144
|
+
@commands.each { |command| command_strings << command.to_s }
|
|
145
|
+
|
|
146
|
+
if @complete
|
|
147
|
+
if @options.empty?
|
|
148
|
+
command_strings << 'run'
|
|
149
|
+
else
|
|
150
|
+
command_strings << "run(#{options_string})"
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
command_strings.join('.')
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
private
|
|
158
|
+
|
|
159
|
+
# Add a command to the pipeline. This is done by first requiring the
|
|
160
|
+
# relevant Class/Module and then calling the relevant command.
|
|
161
|
+
#
|
|
162
|
+
# @param method [Symbol] Method name.
|
|
163
|
+
# @param args [Array] Method arguments.
|
|
164
|
+
# @param block [Proc] Method block.
|
|
165
|
+
#
|
|
166
|
+
# @example Here we add the command `dump` to the pipeline.
|
|
167
|
+
# Pipeline.new.dump
|
|
168
|
+
# # => self
|
|
169
|
+
#
|
|
170
|
+
# @return [self]
|
|
171
|
+
def method_missing(method, *args, &block)
  require_file(method)

  # The command class name is the CamelCased method name,
  # e.g. read_fasta -> ReadFasta.
  const = method.to_s.split('_').map { |word| word.capitalize }.join('')

  if BioDSL.const_defined?(const)
    options = args.first || {}
    options_load_rc(options, method)

    klass = BioDSL.const_get(const)
    [OptionsHelper, StatusHelper].each { |helper| klass.send(:include, helper) }

    # Each command class exposes its work as a lambda through #lmb.
    @commands << Command.new(method, klass.send(:new, options).lmb, options)
  else
    # Not a BioDSL command: defer to the default handling.
    super
  end

  self
end
|
|
192
|
+
|
|
193
|
+
# Require the file from the BioDSL/commands directory whose name matches the
# given method name. E.g. `require_file(:dump)` requires
# `BioDSL/commands/dump` through the load path.
#
# Nothing is required when the matching command class is already defined in
# the BioDSL namespace.
#
# @param method [Symbol]
#   The name of the method.
#
# @raise [LoadError] If `require` cannot find such a file.
def require_file(method)
  # CamelCase the name the same way method_missing does
  # (read_fasta -> ReadFasta); a plain `capitalize` gives `Read_fasta`, which
  # never matches the constant of a multi-word command.
  const = method.to_s.split('_').map(&:capitalize).join('')

  return if BioDSL.const_defined? const

  # FIXME
  # file = File.join('lib', 'BioDSL', 'commands', "#{method}.rb")
  # fail Errno::ENOENT, "No such file: #{file}" unless File.exist? file

  require File.join('BioDSL', 'commands', method.to_s)
end
|
|
210
|
+
|
|
211
|
+
# Print status.
|
|
212
|
+
def print_status
  # Status is only printed for verbose runs.
  return unless @options[:verbose]

  @commands.each do |command|
    # One YAML document per command: its name, options and status.
    report = {
      command: command.name,
      options: command.options,
      status:  command.status
    }

    puts report.to_yaml
  end
end
|
|
223
|
+
|
|
224
|
+
# Check all run options.
|
|
225
|
+
def check_options
  allowed = %i(debug verbose email progress subject input output output_dir
               report force)
  options_allowed(@options, *allowed)

  # Both flags accept true, false or nil.
  %i(debug verbose).each do |flag|
    options_allowed_values(@options, flag => [true, false, nil])
  end

  options_conflict(@options, progress: :verbose)
  options_tie(@options, subject: :email)
  options_files_exist_force(@options, :report)
end
|
|
234
|
+
|
|
235
|
+
# Run all commands in the Pipeline.
|
|
236
|
+
def run_commands
  prefix_output_dir   # Prefix command output files with :output_dir, if set.
  run_time_start      # Stamp every command with the same start time.
  run_add_enumerators # Wrap every command in an Enumerator.
  run_enumerate       # Drain the last Enumerator.
end
|
|
242
|
+
|
|
243
|
+
# Add start time to the status of all commands.
|
|
244
|
+
def run_time_start
  # A single timestamp is taken so all commands share the same start time.
  started_at = Time.now

  @commands.each { |cmd| cmd.status[:time_start] = started_at }
end
|
|
251
|
+
|
|
252
|
+
# Add enumerators to instance array.
|
|
253
|
+
def run_add_enumerators
  @commands.each_with_index do |command, index|
    # Only the first command may read the external :input. Every later command
    # must consume the Enumerator of the command before it; otherwise only the
    # last command would run, fed directly from :input.
    input = (index.zero? && @options[:input]) || @enums.last

    @enums << Enumerator.new { |output| command.call(input, output) }
  end
end
|
|
259
|
+
|
|
260
|
+
# Iterate through all enumerators.
|
|
261
|
+
def run_enumerate
  sink = @options[:output]

  # Without an :output the records are simply drained.
  return @enums.last.each {} unless sink

  begin
    @enums.last.each { |record| sink.write record }
  ensure
    # Close even when a command or the write raises, so the output is never
    # left open.
    sink.close
  end
end
|
|
269
|
+
|
|
270
|
+
# Create an output directory and prefix all output files in the commands
|
|
271
|
+
# with this directory.
|
|
272
|
+
def prefix_output_dir
  dir = @options[:output_dir]
  return unless dir

  FileUtils.mkdir_p(dir) unless File.exist?(dir)

  @commands.each do |command|
    file = command.options[:output]
    next unless file

    # Only commands that have an :output option are redirected.
    command.options[:output] = File.join(dir, file)
  end
end
|
|
285
|
+
|
|
286
|
+
# Save a HTML status report to file.
|
|
287
|
+
def save_report
  report = @options[:report]
  return unless report

  # The report goes inside :output_dir when one was given.
  dir  = @options[:output_dir]
  path = dir ? File.join(dir, report) : report

  File.open(path, 'w') { |ios| ios.puts BioDSL::HtmlReport.new(self).to_html }
end
|
|
300
|
+
|
|
301
|
+
# Run all commands.
|
|
302
|
+
def command_runner
  # A pipeline that has already completed is not run again.
  return if @complete

  progress = @options[:progress]

  if progress
    # The commands are run inside the status_progress block.
    status_progress(@commands) { run_commands }
  else
    run_commands
  end

  @complete = true
end
|
|
313
|
+
|
|
314
|
+
# Set the module-level BioDSL flags for this run.
#
# BioDSL.test is taken from the BP_TEST environment variable; the debug and
# verbose flags are taken from the run options.
#
# @param options [Hash] Options hash.
# @option options [Boolean] :debug   Debug flag.
# @option options [Boolean] :verbose Verbose flag.
def prime_variables(options)
  BioDSL.test    = ENV['BP_TEST']
  BioDSL.debug   = options[:debug]
  BioDSL.verbose = options[:verbose]
end
|
|
324
|
+
|
|
325
|
+
# Report a failed run and terminate.
#
# When BioDSL.test is set the exception is re-raised instead. Otherwise the
# exception message, and the stack trace if BioDSL.verbose is set, are
# written to STDERR, the error is logged and the process exits with status 2.
#
# @param exception [Exception] The exception that aborted the run.
def exit_gracefully(exception)
  raise exception if BioDSL.test

  STDERR.puts "Error in run: #{exception.message}"
  STDERR.puts exception.backtrace if BioDSL.verbose

  log_error(exception)

  exit 2
end
|
|
335
|
+
|
|
336
|
+
# Render the run options as a comma separated list of `key: value` pairs.
# String values are wrapped in double quotes; all other values are
# interpolated as they are.
#
# @return [String] The formatted options.
def options_string
  pairs = @options.map do |key, value|
    value.is_a?(String) ? %(#{key}: "#{value}") : %(#{key}: #{value})
  end

  pairs.join(', ')
end
|
|
353
|
+
end
|
|
354
|
+
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of BioDSL (www.github.com/maasha/BioDSL). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
# Namespace for BioDSL.
|
|
29
|
+
module BioDSL
|
|
30
|
+
# Namespace for Ambiguity.
|
|
31
|
+
module Ambiguity
|
|
32
|
+
# Add C functions to Inline::C object.
|
|
33
|
+
#
|
|
34
|
+
# @param inline_builder [Inline::C] Inline C object.
|
|
35
|
+
def add_ambiguity_macro(inline_builder)
  # Macro for matching nucleotides including ambiguity codes.
  #
  # The characters are cast to unsigned char before indexing: a plain char
  # may be signed, in which case a byte >= 0x80 would give a negative index
  # and read outside the 256 entry bitmap. The arguments are parenthesized so
  # expressions can be passed safely.
  inline_builder.prefix %(
    #define MATCH(A,B) ((bitmap[(unsigned char) (A)] & bitmap[(unsigned char) (B)]) != 0)
  )

  # Bitmap for matching nucleotides including ambiguity codes, indexed by
  # character code (upper and lower case letters share the same values).
  # Each value is a bit set: 1 for A, 2 for T (and U), 4 for C and 8 for G;
  # ambiguity codes combine these, e.g. N is 15. All other bytes are 0.
  inline_builder.prefix %(
    char bitmap[256] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
        0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
        0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
        0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
  )
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
# Namespace for BioDSL.
|
|
29
|
+
module BioDSL
|
|
30
|
+
# Error class for all Assemble errors.
|
|
31
|
+
AssembleError = Class.new(StandardError)
|
|
32
|
+
|
|
33
|
+
# rubocop: disable ClassLength
|
|
34
|
+
|
|
35
|
+
# Class with methods for assembling pair-end reads.
|
|
36
|
+
class Assemble
|
|
37
|
+
require 'inline'
|
|
38
|
+
|
|
39
|
+
extend Ambiguity
|
|
40
|
+
|
|
41
|
+
# Class method to assemble two Seq objects.
|
|
42
|
+
def self.pair(entry1, entry2, options = {})
  # Build a throw-away instance and return the result of its #match.
  new(entry1, entry2, options).match
end
|
|
46
|
+
|
|
47
|
+
# Method to initialize an Assembly object.
|
|
48
|
+
def initialize(entry1, entry2, options)
  @entry1  = entry1
  @entry2  = entry2
  @overlap = 0
  @offset1 = 0
  @offset2 = 0

  # Work on a shallow copy so the defaults below are never written into the
  # caller's hash (which also keeps frozen option hashes usable).
  @options = options.dup
  @options[:mismatches_max] ||= 0
  @options[:overlap_min]    ||= 1

  check_options
end
|
|
60
|
+
|
|
61
|
+
# Check option values are sane.
|
|
62
|
+
#
|
|
63
|
+
# @raise [AssembleError] on bad values.
|
|
64
|
+
def check_options
  mismatches_max = @options[:mismatches_max]
  overlap_max    = @options[:overlap_max]
  overlap_min    = @options[:overlap_min]

  if mismatches_max < 0
    fail AssembleError,
         "mismatches_max must be zero or greater - not: #{mismatches_max}"
  end

  # :overlap_max is optional and only validated when given.
  if overlap_max && overlap_max <= 0
    fail AssembleError,
         "overlap_max must be one or greater - not: #{overlap_max}"
  end

  if overlap_min <= 0
    fail AssembleError,
         "overlap_min must be one or greater - not: #{overlap_min}"
  end
end
|
|
80
|
+
|
|
81
|
+
# Method to locate overlapping matches between two sequences.
|
|
82
|
+
def match
  calc_overlap
  diff = calc_diff

  @offset1 = @entry1.length - @overlap - diff

  while @overlap >= @options[:overlap_min]
    # :mismatches_max is applied as a percentage of the current overlap.
    allowed = (@overlap * @options[:mismatches_max] * 0.01).round

    hamming = match_C(@entry1.seq, @entry2.seq, @offset1, @offset2,
                      @overlap, allowed)

    # match_C reports a failed comparison as a negative number.
    if hamming && hamming >= 0
      merged = entry_left + entry_overlap + entry_right

      if @entry1.seq_name
        merged.seq_name =
          @entry1.seq_name + ":overlap=#{@overlap}:hamming=#{hamming}"
      end

      return merged
    end

    # No acceptable match at this position: move one position along entry1.
    # While there is length difference left it is used up first, after that
    # the overlap shrinks by one per step.
    if diff > 0
      diff -= 1
    else
      @overlap -= 1
    end

    @offset1 += 1
  end
end
|
|
105
|
+
|
|
106
|
+
# Calculate the overlap to be matched.
|
|
107
|
+
def calc_overlap
  # The overlap can never exceed the shorter entry ...
  limits = [@entry1.length, @entry2.length]

  # ... nor :overlap_max when that option is given.
  limits.unshift(@options[:overlap_max]) if @options[:overlap_max]

  @overlap = limits.min
end
|
|
114
|
+
|
|
115
|
+
# Calculate the diff between sequence lengths and return this.
|
|
116
|
+
#
|
|
117
|
+
# @return [Fixnum] Diff.
|
|
118
|
+
def calc_diff
  # How much longer entry 1 is than entry 2; never negative.
  [@entry1.length - @entry2.length, 0].max
end
|
|
123
|
+
|
|
124
|
+
# Method to extract and downcase the left part of an assembled pair.
|
|
125
|
+
#
|
|
126
|
+
# @return [BioDSL::Seq] Left part.
|
|
127
|
+
def entry_left
  # Everything in entry 1 before the overlap, with the sequence lowercased in
  # place on the extracted sub-entry.
  @entry1[0...@offset1].tap { |left| left.seq.downcase! }
end
|
|
132
|
+
|
|
133
|
+
# Method to extract and downcase the right part of an assembled pair.
|
|
134
|
+
#
|
|
135
|
+
# @return [BioDSL::Seq] Right part.
|
|
136
|
+
def entry_right
  # If entry 1 extends beyond the overlap its tail is the right part,
  # otherwise the tail of entry 2 is used.
  source, start = if @entry1.length > @offset1 + @overlap
                    [@entry1, @offset1 + @overlap]
                  else
                    [@entry2, @offset2 + @overlap]
                  end

  right = source[start..-1]
  right.seq.downcase!
  right
end
|
|
146
|
+
|
|
147
|
+
# Method to extract and upcase the overlapping part of an assembled pair.
|
|
148
|
+
#
|
|
149
|
+
# @return [BioDSL::Seq] Overlapping part.
|
|
150
|
+
def entry_overlap
  span1 = @offset1...@offset1 + @overlap

  overlap = if @entry1.qual && @entry2.qual
              # Both entries carry scores: merge the two overlapping parts.
              span2 = @offset2...@offset2 + @overlap
              merge_overlap(@entry1[span1], @entry2[span2])
            else
              # Without scores on both entries, entry 1 is used as is.
              @entry1[span1]
            end

  overlap.seq.upcase!
  overlap
end
|
|
163
|
+
|
|
164
|
+
# Method to merge sequence and quality scores in an overlap.
|
|
165
|
+
# The residue with the highest score at mismatch positions is selected.
|
|
166
|
+
# The quality scores of the overlap are the mean of the two sequences.
|
|
167
|
+
def merge_overlap(entry_overlap1, entry_overlap2)
  length = entry_overlap1.length

  # Index 0 of the second dimension holds entry 1, index 1 holds entry 2.
  residues = NArray.byte(length, 2)
  residues[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, 'byte')
  residues[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, 'byte')

  scores = NArray.byte(length, 2)
  scores[true, 0] = NArray.to_na(entry_overlap1.qual, 'byte')
  scores[true, 1] = NArray.to_na(entry_overlap2.qual, 'byte')

  # Positions where the two (lowercased) residues differ.
  differs = residues[true, 0] ^ residues[true, 1] > 0

  # Keep, per position, the residue(s) whose masked score equals the maximum.
  masked = scores * differs
  keep   = masked.eq(masked.max(1))

  merged      = Seq.new
  merged.seq  = (residues * keep).max(1).to_s
  merged.qual = scores.mean(1).round.to_type('byte').to_s

  merged
end
|
|
185
|
+
|
|
186
|
+
inline do |builder|
  add_ambiguity_macro(builder)

  # C method comparing two stretches of equal length, starting at the given
  # offsets in each string, using the MATCH macro (IUPAC ambiguity codes).
  # Returns the number of mismatches counted when the required number of
  # matches (length - max mismatches) is reached, and -1 otherwise.
  builder.c %{
    VALUE match_C(
      VALUE _string1,       // String 1
      VALUE _string2,       // String 2
      VALUE _offset1,       // Offset 1
      VALUE _offset2,       // Offset 2
      VALUE _length,        // String length
      VALUE _max_mismatch   // Maximum mismatches
    )
    {
      char         *string1      = StringValuePtr(_string1);
      char         *string2      = StringValuePtr(_string2);
      unsigned int  offset1      = FIX2UINT(_offset1);
      unsigned int  offset2      = FIX2UINT(_offset2);
      unsigned int  length       = FIX2UINT(_length);
      unsigned int  max_mismatch = FIX2UINT(_max_mismatch);

      unsigned int max_match = length - max_mismatch;
      unsigned int match     = 0;
      unsigned int mismatch  = 0;
      unsigned int i         = 0;

      for (i = 0; i < length; i++)
      {
        if (MATCH(string1[i + offset1], string2[i + offset2]))
        {
          match++;

          if (match >= max_match) {
            return UINT2NUM(mismatch);
          }
        }
        else
        {
          mismatch++;

          if (mismatch > max_mismatch) {
            return INT2NUM(-1);
          }
        }
      }

      return INT2NUM(-1);
    }
  }
end
|
|
239
|
+
end
|
|
240
|
+
end
|