BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# rubocop:disable ClassLength
|
|
30
|
+
|
|
31
|
+
# == Assemble sequences in the stream using Ray.
|
|
32
|
+
#
|
|
33
|
+
# +assemble_seq_ray+ is a wrapper around the deBruijn graph assembler Ray:
|
|
34
|
+
#
|
|
35
|
+
# http://denovoassembler.sourceforge.net/
|
|
36
|
+
#
|
|
37
|
+
# Any records containing sequence information will be included in the
|
|
38
|
+
# assembly, but only the assembled contig sequences will be output to the
|
|
39
|
+
# stream.
|
|
40
|
+
#
|
|
41
|
+
# The sequence records may contain quality scores, and if the sequence
|
|
42
|
+
# names indicate that the sequence order is interleaved, paired-end
|
|
43
|
+
# assembly will be performed.
|
|
44
|
+
#
|
|
45
|
+
# Kmer values must be odd.
|
|
46
|
+
#
|
|
47
|
+
# == Usage
|
|
48
|
+
#
|
|
49
|
+
# assemble_seq_ray([kmer_min: <uint>[, kmer_max: <uint>
|
|
50
|
+
# [, contig_min: <uint>[, cpus: <uint>]]]])
|
|
51
|
+
#
|
|
52
|
+
# === Options
|
|
53
|
+
#
|
|
54
|
+
# * kmer_min: <uint> - Minimum k-mer value (default: 21).
|
|
55
|
+
# * kmer_max: <uint> - Maximum k-mer value (default: 49).
|
|
56
|
+
# * contig_min: <uint> - Minimum contig size (default: 500).
|
|
57
|
+
# * cpus: <uint> - Number of CPUs to use (default: 1).
|
|
58
|
+
#
|
|
59
|
+
# == Examples
|
|
60
|
+
#
|
|
61
|
+
# If you have two pair-end sequence files with the Illumina data then you
|
|
62
|
+
# can assemble these using +assemble_seq_ray+ like this:
|
|
63
|
+
#
|
|
64
|
+
# BP.new.
|
|
65
|
+
# read_fastq(input: "file1.fq", input2: "file2.fq").
|
|
66
|
+
# assemble_seq_ray.
|
|
67
|
+
# write_fasta(output: "contigs.fna").
|
|
68
|
+
# run
|
|
69
|
+
class AssembleSeqRay
|
|
70
|
+
require 'English'
|
|
71
|
+
require 'BioDSL/helpers/aux_helper'
|
|
72
|
+
|
|
73
|
+
include AuxHelper
|
|
74
|
+
|
|
75
|
+
# Status keys passed to status_init when the command lambda starts.
STATS = %i[
  records_in records_out sequences_in sequences_out residues_in
  residues_out n50 contig_min contig_max kmer
]
|
|
77
|
+
|
|
78
|
+
# Build an AssembleSeqRay command object.
#
# Verifies that the external executables +Ray+ and +mpiexec+ are available,
# fills in default option values and then validates the options.
#
# @param [Hash] options Options hash.
# @option options [Integer] :kmer_min   Minimum kmer value.
# @option options [Integer] :kmer_max   Maximum kmer value.
# @option options [Integer] :contig_min Minimum contig size.
# @option options [Integer] :cpus       CPUs to use.
#
# @return [AssembleSeqRay] Returns an instance of the class.
def initialize(options)
  @options = options
  @lengths = []
  @paired  = nil

  %w(Ray mpiexec).each { |executable| aux_exist(executable) }

  defaults
  check_options
end
|
|
96
|
+
|
|
97
|
+
# Return a lambda for the AssembleSeqRay command.
#
# The lambda writes all sequence records to a temporary FASTA file, runs one
# Ray assembly per kmer, picks the kmer whose assembly has the highest N50
# and emits the scaffolds of that assembly to the output stream.
#
# An assembly whose N50 is nil (no scaffold passed the :contig_min filter) is
# ranked as 0 instead of breaking the comparison. On ties the first result in
# kmer order wins.
#
# @raise [RuntimeError] If no assembly was run at all.
#
# @return [Proc] Returns the command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    TmpDir.create('reads.fa') do |fa_in, tmp_dir|
      process_input(input, output, fa_in)
      @paired = paired?(fa_in)

      n50s = run_assemblies(fa_in, tmp_dir)

      if n50s.empty?
        fail 'No Ray assemblies were run - check that kmer_min <= kmer_max'
      end

      # A nil n50 cannot be compared with an Integer, so rank it lowest.
      best_kmer = n50s.max_by { |result| result.n50 || 0 }.kmer

      process_output(output, tmp_dir, best_kmer)
    end
  end
end
|
|
116
|
+
|
|
117
|
+
private
|
|
118
|
+
|
|
119
|
+
# Assemble once for every second kmer from :kmer_min up to :kmer_max and
# collect one N50 object per assembly.
#
# Each assembly is written to its own sub-directory of +tmp_dir+ named after
# the kmer.
#
# @param fa_in   [String] Path to input FASTA file.
# @param tmp_dir [String] Temporary directory path.
#
# @return [Array] List of N50 objects.
def run_assemblies(fa_in, tmp_dir)
  kmers = (@options[:kmer_min]..@options[:kmer_max]).step(2)

  kmers.map do |kmer|
    result_dir = File.join(tmp_dir, kmer.to_s)
    execute_ray(fa_in, result_dir, kmer)
    parse_result(result_dir, kmer)
  end
end
|
|
137
|
+
|
|
138
|
+
# Validate the options hash: only known keys are allowed, kmer values must
# lie in 21..255 and be odd, :contig_min must be positive and :cpus must lie
# between 1 and BioDSL::Config::CORES_MAX.
def check_options
  options_allowed(@options, :kmer_min, :kmer_max, :contig_min, :cpus)

  [
    ':kmer_min >= 21',
    ':kmer_min <= 255',
    ':kmer_max >= 21',
    ':kmer_max <= 255',
    ':contig_min > 0',
    ':cpus >= 1',
    ":cpus <= #{BioDSL::Config::CORES_MAX}"
  ].each { |expression| options_assert(@options, expression) }

  assert_uneven(@options, :kmer_min)
  assert_uneven(@options, :kmer_max)
end
|
|
152
|
+
|
|
153
|
+
# Assert that the value stored under a given key in an options hash is odd.
#
# @param options [Hash]   Options hash.
# @param key     [Symbol] Hash key whose value to check.
#
# @raise [RuntimeError] if the value is even.
def assert_uneven(options, key)
  value = options[key]

  fail "#{key} must be an odd number - not #{value}" if value.even?
end
|
|
164
|
+
|
|
165
|
+
# Fill in default values for every option that is unset (nil or false):
# kmer_min 21, kmer_max 49, contig_min 500 and cpus 1.
def defaults
  { kmer_min: 21, kmer_max: 49, contig_min: 500 }.each do |key, value|
    @options[key] ||= value
  end

  @options[:cpus] ||= 1
end
|
|
172
|
+
|
|
173
|
+
# Consume the input stream. Records without a :SEQ key are passed straight to
# the output stream; records with one are written as FASTA entries to the
# temporary file. Status counters are updated along the way.
#
# @param input  [Enumerator]          Input stream.
# @param output [Enumerator::Yielder] Output stream.
# @param fa_in  [String]              Path to temporary FASTA file.
def process_input(input, output, fa_in)
  BioDSL::Fasta.open(fa_in, 'w') do |fasta_io|
    input.each do |record|
      @status[:records_in] += 1

      unless record.key?(:SEQ)
        @status[:records_out] += 1
        output.puts record
        next
      end

      entry = BioDSL::Seq.new_bp(record)

      @status[:sequences_in] += 1
      @status[:residues_in]  += entry.length

      fasta_io.puts entry.to_fasta
    end
  end
end
|
|
198
|
+
|
|
199
|
+
# Check if the reads in a given FASTA file are paired by inspecting the
# sequence names of the first two entries.
#
# A file with fewer than two entries cannot hold a pair and is reported as
# unpaired, rather than passing a missing entry on to the name check.
# NOTE(review): assumes next_entry returns nil at end of file - confirm.
#
# @param file [String] Path to FASTA file.
#
# @return [Boolean] True if paired else false.
def paired?(file)
  BioDSL::Fasta.open(file, 'r') do |ios|
    entry1 = ios.next_entry
    entry2 = ios.next_entry

    return false unless entry1 && entry2

    begin
      BioDSL::Seq.check_name_pair(entry1, entry2)

      return true
    rescue SeqError
      # The name check raises SeqError when the names do not form a pair.
      return false
    end
  end
end
|
|
220
|
+
|
|
221
|
+
# Run Ray for a single kmer through a system call.
#
# The command line is echoed to stderr when BioDSL.verbose is set.
#
# @param fa_in   [String] Path to input FASTA file.
# @param tmp_dir [String] Directory Ray writes its result to.
# @param kmer    [Fixnum] Kmer size.
#
# @raise [RuntimeError] With the command line as message if the command
#   exits unsuccessfully.
def execute_ray(fa_in, tmp_dir, kmer)
  command = compile_cmd_line(fa_in, tmp_dir, kmer)

  $stderr.puts "Running: #{command}" if BioDSL.verbose

  system(command)

  return if $CHILD_STATUS.success?

  fail command
end
|
|
235
|
+
|
|
236
|
+
# Compile the command and options for executing Ray via mpiexec.
#
# The reads are passed with -i (interleaved) when @paired is set and with -s
# (single) otherwise. Unless BioDSL.verbose is set, all output of the command
# is redirected to /dev/null.
#
#   mpiexec -n 6 Ray -k 31 -i interleaved -o output_dir
#   mpiexec -n 6 Ray -k 31 -s single -o output_dir
#
# @param fa_in   [String] Path to input FASTA file.
# @param out_dir [String] Output directory path.
# @param kmer    [Fixnum] Kmer size.
#
# @return [String] The command line for the Ray system call.
def compile_cmd_line(fa_in, out_dir, kmer)
  input_flag = @paired ? '-i' : '-s'

  parts = [
    'mpiexec',
    "-n #{@options[:cpus]}",
    'Ray',
    "-k #{kmer}",
    "#{input_flag} #{fa_in}",
    "-o #{out_dir}"
  ]
  parts << '> /dev/null 2>&1' unless BioDSL.verbose

  parts.join(' ')
end
|
|
263
|
+
|
|
264
|
+
# Read Scaffolds.fasta from a Ray output directory and summarise it as an N50
# object. Scaffolds shorter than :contig_min are ignored.
#
# @param dir  [String] Path to output dir.
# @param kmer [Fixnum] Kmer size.
#
# @return [N50] Result object holding the kmer and the n50 of the assembly.
def parse_result(dir, kmer)
  scaffolds = File.join(dir, 'Scaffolds.fasta')
  kept      = []

  BioDSL::Fasta.open(scaffolds) do |ios|
    ios.each do |entry|
      kept << entry.length unless entry.length < @options[:contig_min]
    end
  end

  N50.new(kmer, calc_n50(kept))
end
|
|
281
|
+
|
|
282
|
+
# Calculate the n50: the length of the contig at which the running total of
# lengths, taken from longest to shortest, reaches half of the summed length.
#
# {http://en.wikipedia.org/wiki/N50_statistic}
#
# The given list is left untouched; sorting is done on a copy.
#
# @param lengths [Array] List of contig lengths.
#
# @return [Integer, nil] The n50, or nil if the list is empty.
def calc_n50(lengths)
  return nil if lengths.empty?

  sorted  = lengths.sort.reverse
  half    = sorted.inject(0, :+) * 0.50
  running = 0

  sorted.find { |length| (running += length) >= half }
end
|
|
302
|
+
|
|
303
|
+
# Emit the scaffolds of the winning assembly to the output stream, skipping
# those shorter than :contig_min, and record the summary statistics.
#
# @param output [Enumerator::Yielder] Output stream.
# @param dir    [String]              Path to tmp_dir.
# @param kmer   [Fixnum]              Highest n50 scoring kmer.
def process_output(output, dir, kmer)
  scaffolds = File.join(dir, kmer.to_s, 'Scaffolds.fasta')
  lengths   = []

  BioDSL::Fasta.open(scaffolds, 'r') do |ios|
    ios.each do |entry|
      next unless entry.length >= @options[:contig_min]

      lengths << entry.length
      output << entry.to_bp

      @status[:records_out]   += 1
      @status[:sequences_out] += 1
      @status[:residues_out]  += entry.length
    end
  end

  add_stats(kmer, lengths)
end
|
|
327
|
+
|
|
328
|
+
# Store the chosen kmer and the paired flag in the status hash and, when at
# least one contig was emitted, the shortest and longest contig length and
# the n50.
#
# @param kmer    [Fixnum] Highest n50 scoring kmer.
# @param lengths [Array]  List of contig lengths.
def add_stats(kmer, lengths)
  @status[:kmer]   = kmer
  @status[:paired] = @paired

  return if lengths.empty?

  @status[:contig_min] = lengths.min
  @status[:contig_max] = lengths.max
  @status[:n50]        = calc_n50(lengths)
end
|
|
342
|
+
|
|
343
|
+
# Result of a single assembly: the kmer used and the n50 it produced.
N50 = Struct.new(:kmer, :n50)
|
|
344
|
+
end
|
|
345
|
+
end
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Assemble sequences in the stream using SPAdes.
|
|
30
|
+
#
|
|
31
|
+
# +assemble_seq_spades+ is a wrapper around the single prokaryotic genome
|
|
32
|
+
# assembler SPAdes:
|
|
33
|
+
#
|
|
34
|
+
# http://bioinf.spbau.ru/spades
|
|
35
|
+
#
|
|
36
|
+
# Any records containing sequence information will be included in the
|
|
37
|
+
# assembly, but only the assembled contig sequences will be output to the
|
|
38
|
+
# stream.
|
|
39
|
+
#
|
|
40
|
+
# The sequence records may contain quality scores, and if the sequence
|
|
41
|
+
# names indicate that the sequence order is interleaved, paired-end
|
|
42
|
+
# assembly will be performed.
|
|
43
|
+
#
|
|
44
|
+
# == Usage
|
|
45
|
+
#
|
|
46
|
+
# assemble_seq_spades([careful: <bool>[, cpus: <uint>[, kmers: <list>]]])
|
|
47
|
+
#
|
|
48
|
+
# === Options
|
|
49
|
+
#
|
|
50
|
+
# * careful: <bool> - Run SPAdes with the careful flag set.
|
|
51
|
+
# * cpus: <uint> - Number of CPUs to use (default: 1).
|
|
52
|
+
# * kmers: <list> - List of kmers to use (default: auto).
|
|
53
|
+
#
|
|
54
|
+
# == Examples
|
|
55
|
+
#
|
|
56
|
+
# If you have two pair-end sequence files with the Illumina data then you
|
|
57
|
+
# can assemble these using assemble_seq_spades like this:
|
|
58
|
+
#
|
|
59
|
+
# BP.new.
|
|
60
|
+
# read_fastq(input: "file1.fq", input2: "file2.fq").
|
|
61
|
+
# assemble_seq_spades(kmers: [55,77,99,127]).
|
|
62
|
+
# write_fasta(output: "contigs.fna").
|
|
63
|
+
# run
|
|
64
|
+
# rubocop:disable ClassLength
|
|
65
|
+
class AssembleSeqSpades
|
|
66
|
+
require 'English'
|
|
67
|
+
require 'BioDSL/helpers/aux_helper'
|
|
68
|
+
|
|
69
|
+
include AuxHelper
|
|
70
|
+
|
|
71
|
+
# Status keys passed to status_init when the command lambda starts.
#
# :residues_out is listed because process_output updates it with +=; the
# list previously repeated :records_out in its place.
# NOTE(review): presumably status_init zeroes every listed key - confirm.
STATS = %i(records_in records_out sequences_in sequences_out residues_in
           residues_out assembled)
|
|
73
|
+
|
|
74
|
+
# Build an AssembleSeqSpades command object.
#
# Verifies that the external executable +spades.py+ is available, validates
# the options and then fills in default option values.
#
# @param [Hash] options Options hash.
# @option options [Boolean] :careful Flag indicating use of careful assembly.
# @option options [Array]   :kmers   List of kmers to use.
# @option options [Integer] :cpus    CPUs to use.
#
# @return [AssembleSeqSpades] Returns an instance of the class.
def initialize(options)
  @options = options
  @lengths = []
  @type    = nil

  aux_exist('spades.py')

  check_options
  defaults
end
|
|
97
|
+
|
|
98
|
+
# Return a lambda for the AssembleSeqSpades command.
#
# The lambda writes the sequence records to temporary FASTQ/FASTA files,
# runs SPAdes on the FASTQ file if any entry had quality scores (otherwise on
# the FASTA file), emits the resulting scaffolds and finally adds the contig
# statistics to the status hash.
#
# @return [Proc] Returns the command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    TmpDir.create('reads.fq', 'reads.fa') do |in_fq, in_fa, tmp_dir|
      process_input(in_fq, in_fa, input, output)

      reads     = @type == :fastq ? in_fq : in_fa
      scaffolds = File.join(tmp_dir, 'scaffolds.fasta')

      execute_spades(reads, tmp_dir)
      process_output(output, scaffolds)
    end

    calc_n50(status)
  end
end
|
|
115
|
+
|
|
116
|
+
private
|
|
117
|
+
|
|
118
|
+
# Validate the options hash: only :careful, :cpus and :kmers are allowed,
# :careful must be true, false or nil, and :cpus must lie between 1 and
# BioDSL::Config::CORES_MAX.
def check_options
  options_allowed(@options, :careful, :cpus, :kmers)
  options_allowed_values(@options, careful: [true, false, nil])

  options_assert(@options, ':cpus >= 1')

  max_cpus = BioDSL::Config::CORES_MAX
  options_assert(@options, ":cpus <= #{max_cpus}")
end
|
|
125
|
+
|
|
126
|
+
# Fill in default option values: a single CPU unless :cpus is already set.
def defaults
  @options[:cpus] ||= 1
end
|
|
130
|
+
|
|
131
|
+
# Consume the input stream. Records without a :SEQ key are passed straight to
# the output stream; records with one are handed to write_sequence, which
# stores them in one of the two temporary files.
#
# @param in_fq  [String]              Path to FASTQ temp file.
# @param in_fa  [String]              Path to FASTA temp file.
# @param input  [Enumerator]          Input stream.
# @param output [Enumerator::Yielder] Output stream.
def process_input(in_fq, in_fa, input, output)
  BioDSL::Fastq.open(in_fq, 'w') do |io_fq|
    BioDSL::Fasta.open(in_fa, 'w') do |io_fa|
      input.each do |record|
        @status[:records_in] += 1

        unless record.key?(:SEQ)
          @status[:records_out] += 1
          output.puts record
          next
        end

        write_sequence(io_fq, io_fa, record)
      end
    end
  end
end
|
|
153
|
+
|
|
154
|
+
# Write a sequence record to one of the temporary files: entries with quality
# scores go to the FASTQ stream (and mark the input type as :fastq), all
# others go to the FASTA stream.
#
# @param io_fq  [BioDSL::Fastq::IO] FASTQ IO stream.
# @param io_fa  [BioDSL::Fasta::IO] FASTA IO stream.
# @param record [Hash]              BioPiece record with sequence.
def write_sequence(io_fq, io_fa, record)
  entry = BioDSL::Seq.new_bp(record)

  @status[:sequences_in] += 1
  @status[:residues_in]  += entry.length

  return io_fa.puts(entry.to_fasta) unless entry.qual

  @type = :fastq
  io_fq.puts entry.to_fastq
end
|
|
172
|
+
|
|
173
|
+
# Execute spades using a system call.
#
# With BioDSL.verbose set the command line is echoed to stderr and the
# command output is left visible; otherwise all output is redirected to
# /dev/null.
#
# @param input_file [String] Path to input file.
# @param tmp_dir    [String] Path to temp dir.
#
# @raise [RuntimeError] if the command exits unsuccessfully.
def execute_spades(input_file, tmp_dir)
  cmd_line  = compile_command(input_file, tmp_dir)
  shell_cmd = cmd_line

  if BioDSL.verbose
    $stderr.puts cmd_line
  else
    shell_cmd += ' > /dev/null 2>&1'
  end

  system(shell_cmd)

  return if $CHILD_STATUS.success?

  fail "Command failed: #{cmd_line}"
end
|
|
191
|
+
|
|
192
|
+
# Compile the spades command.
#
# The input file is always passed with --12 and the assembler is run with
# --only-assembler. --careful and -k are added only when the matching options
# are set.
#
# @param input_file [String] Path to input file.
# @param tmp_dir    [String] Path to temp dir.
#
# @return [String] A command string for executing Spades.
def compile_command(input_file, tmp_dir)
  kmers = @options[:kmers]

  args = ['spades.py', "--12 #{input_file}", '--only-assembler']
  args << '--careful' if @options[:careful]
  args << "-k #{kmers.join(',')}" if kmers
  args.push("-t #{@options[:cpus]}", "-o #{tmp_dir}")

  args.join(' ')
end
|
|
210
|
+
|
|
211
|
+
# Read the contigs produced by spades, emit each one to the output stream and
# remember its length for the later n50 calculation.
#
# @param output      [Enumerator::Yielder] Output stream
# @param output_file [String] Path to output FASTA file with contigs.
def process_output(output, output_file)
  BioDSL::Fasta.open(output_file) do |ios|
    ios.each do |entry|
      output << entry.to_bp

      size = entry.length

      @status[:records_out]   += 1
      @status[:sequences_out] += 1
      @status[:residues_out]  += size

      @lengths << size
    end
  end
end
|
|
227
|
+
|
|
228
|
+
# Add contig statistics to the status hash: the longest and shortest contig
# and the n50, stored under :contig_n50.
#
# {http://en.wikipedia.org/wiki/N50_statistic}
#
# The collected contig lengths in @lengths are sorted in place, longest
# first. The n50 is the first length at which the running total reaches half
# of status[:residues_out].
#
# @param status [Hash] Status hash.
def calc_n50(status)
  @lengths.sort! { |a, b| b <=> a }

  status[:contig_max] = @lengths.first
  status[:contig_min] = @lengths.last

  running = 0

  @lengths.each do |length|
    running += length

    next if running < status[:residues_out] * 0.50

    status[:contig_n50] = length
    break
  end
end
|
|
251
|
+
end
|
|
252
|
+
end
|