BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Assemble ordered overlapping pair-end sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +assemble_pairs+ assembles overlapping pair-end sequences into single
|
|
32
|
+
# sequences that are output to the stream - the original sequences are not
|
|
33
|
+
# output. Assembly works by progressively considering all overlaps between the
|
|
34
|
+
# maximum considered overlap using the +overlap_max+ option (default is the
|
|
35
|
+
# length of the shortest sequence) until the minimum required overlap supplied
|
|
36
|
+
# with the +overlap_min+ option (default 1). For each overlap a percentage of
|
|
37
|
+
# mismatches can be allowed using the +mismatch_percent+ option (default 20%).
|
|
38
|
+
#
|
|
39
|
+
# Mismatches in the overlapping regions are resolved so that the residue with
|
|
40
|
+
# the highest quality score is used in the assembled sequence. The quality
|
|
41
|
+
# scores are averaged in the overlapping region. The sequence of the
|
|
42
|
+
# overlapping region is output in upper case and the remaining in lower case.
|
|
43
|
+
#
|
|
44
|
+
# Furthermore, sequences must be in interleaved order in the stream - use
|
|
45
|
+
# +read_fastq+ with +input+ and +input2+ options for that.
|
|
46
|
+
#
|
|
47
|
+
# The additional keys are added to records with assembled sequences:
|
|
48
|
+
#
|
|
49
|
+
# * OVERLAP_LEN - the length of the located overlap.
|
|
50
|
+
# * HAMMING_DIST - the number of mismatches in the assembly.
|
|
51
|
+
#
|
|
52
|
+
# Using the +merge_unassembled+ option will merge any unassembled sequences
|
|
53
|
+
# taking into account reverse complementation of read2 if the
|
|
54
|
+
# +reverse_complement+ option is true. Note that you probably want to set
|
|
55
|
+
# +overlap_min+ to 1 before using +merge_unassembled+ to improve chances of
|
|
56
|
+
# making an assembly before falling back to a simple merge.
|
|
57
|
+
#
|
|
58
|
+
# == Usage
|
|
59
|
+
#
|
|
60
|
+
# assemble_pairs([mismatch_percent: <uint>[, overlap_min: <uint>
|
|
61
|
+
# [, overlap_max: <uint>[, reverse_complement: <bool>
|
|
62
|
+
# [, merge_unassembled: <bool>]]]]])
|
|
63
|
+
#
|
|
64
|
+
# === Options
|
|
65
|
+
#
|
|
66
|
+
# * mismatch_percent: <uint> - Maximum allowed overlap mismatches in
|
|
67
|
+
# percent (default=20).
|
|
68
|
+
# * overlap_min: <uint> - Minimum overlap required (default=1).
|
|
69
|
+
# * overlap_max: <uint> - Maximum overlap considered
|
|
70
|
+
# (default=<length of shortest sequences>).
|
|
71
|
+
# * reverse_complement: <bool> - Reverse-complement read2 before assembly
|
|
72
|
+
# (default=false).
|
|
73
|
+
# * merge_unassembled: <bool> - Merge unassembled pairs (default=false).
|
|
74
|
+
#
|
|
75
|
+
# == Examples
|
|
76
|
+
#
|
|
77
|
+
# If you have two pair-end sequence files with the Illumina data then you
|
|
78
|
+
# can assemble these using assemble_pairs like this:
|
|
79
|
+
#
|
|
80
|
+
# BP.new.
|
|
81
|
+
# read_fastq(input: "file1.fq", input2: "file2.fq").
|
|
82
|
+
# assemble_pairs(reverse_complement: true).
|
|
83
|
+
# run
|
|
84
|
+
# rubocop:disable ClassLength
|
|
85
|
+
class AssemblePairs
|
|
86
|
+
STATS = %i(overlap_sum hamming_sum records_in records_out sequences_in
|
|
87
|
+
sequences_out residues_in residues_out assembled unassembled)
|
|
88
|
+
|
|
89
|
+
# Create a new AssemblePairs command instance.
#
# The running overlap and hamming sums start at zero, then the given
# options are checked and missing options are given default values.
#
# @param options [Hash] Options hash.
#
# @option options [Integer] :mismatch_percent
#   Maximum allowed overlap mismatches in percent.
#
# @option options [Integer] :overlap_min
#   Minimum length of overlap.
#
# @option options [Integer] :overlap_max
#   Maximum length of overlap.
#
# @option options [Boolean] :reverse_complement
#   Reverse-complement read2.
#
# @option options [Boolean] :merge_unassembled
#   Merge read pairs that couldn't be assembled.
#
# @option options [Boolean] :allow_unassembled
#   Output reads that couldn't be assembled.
#
# @return [AssemblePairs] Returns an instance of the class.
def initialize(options)
  @hamming_sum = 0
  @overlap_sum = 0
  @options     = options

  check_options
  defaults
end
|
|
121
|
+
|
|
122
|
+
# Return a lambda for the assemble_pairs command.
#
# The lambda reads records from the input stream two at a time. A pair where
# both records hold a :SEQ value is handed to +assemble_pairs+; any other
# records are passed to +output_record+ unchanged. Finally the derived
# status values are calculated.
#
# @return [Proc] Returns the assemble_pairs command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    input.each_slice(2) do |record1, record2|
      # A stream with an odd number of records ends with a lone record1, so
      # record2 is only counted when it is actually present.
      @status[:records_in] += record2 ? 2 : 1

      if record2 && record1[:SEQ] && record2[:SEQ]
        assemble_pairs(record1, record2, output)
      else
        output_record(record1, output)
        output_record(record2, output) if record2
      end
    end

    calc_status
  end
end
|
|
143
|
+
|
|
144
|
+
private
|
|
145
|
+
|
|
146
|
+
# Validate the options given to the command.
#
# The option keys, the values of the three flag options, the
# :allow_unassembled / :merge_unassembled pair and the numeric ranges are
# each handed to the matching options_* helper.
def check_options
  options_allowed(@options, :mismatch_percent, :overlap_min, :overlap_max,
                  :reverse_complement, :merge_unassembled,
                  :allow_unassembled)

  %i(reverse_complement merge_unassembled allow_unassembled).each do |flag|
    options_allowed_values(@options, flag => [true, false, nil])
  end

  options_conflict(@options, allow_unassembled: :merge_unassembled)

  options_assert(@options, ':mismatch_percent >= 0')
  options_assert(@options, ':mismatch_percent <= 100')
  options_assert(@options, ':overlap_min > 0')
end
|
|
159
|
+
|
|
160
|
+
# Fill in default values for the options that were not given:
# :mismatch_percent falls back to 20 and :overlap_min to 1.
def defaults
  fallbacks = { mismatch_percent: 20, overlap_min: 1 }
  fallbacks.each { |key, value| @options[key] ||= value }
  @options[:overlap_min]
end
|
|
165
|
+
|
|
166
|
+
# Output a record to the stream if a stream is provided, and count it as a
# record written.
#
# @param record [Hash] BioDSL record to output.
# @param output [Enumerator::Yielder, nil] Output stream or nil.
def output_record(record, output)
  if output
    output << record
    @status[:records_out] += 1
  end
end
|
|
175
|
+
|
|
176
|
+
# Try to assemble a pair of sequence records and output the result.
#
# When no assembly is produced the pair is merged if :merge_unassembled is
# set, output as two entries if :allow_unassembled is set, and otherwise
# only counted as unassembled.
#
# @param record1 [Hash] BioDSL record1.
# @param record2 [Hash] BioDSL record2.
# @param output  [Enumerator::Yielder] Output stream.
def assemble_pairs(record1, record2, output)
  entry1, entry2 = records2entries(record1, record2)
  assembled = nil

  # Assembly is only attempted when both entries can hold the minimum
  # overlap.
  if overlap_possible?(entry1, entry2, @options[:overlap_min])
    assembled = assemble_entries(entry1, entry2)
  end

  return output_assembled(assembled, output) if assembled
  return output_merged(entry1, entry2, output) if @options[:merge_unassembled]
  return output_entries(entry1, entry2, output) if @options[:allow_unassembled]

  @status[:unassembled] += 1
end
|
|
195
|
+
|
|
196
|
+
# Convert a pair of records into DNA sequence entries, reverse-complementing
# the second entry when the :reverse_complement option is set and the entry
# is not empty. The input sequence and residue counts are updated.
#
# @param record1 [Hash] Record1.
# @param record2 [Hash] Record2.
#
# @return [Array] Returns a tuple of sequence entries.
def records2entries(record1, record2)
  entries = [record1, record2].map { |record| BioDSL::Seq.new_bp(record) }
  entries.each { |entry| entry.type = :dna }

  mate = entries.last
  mate.reverse!.complement! if @options[:reverse_complement] && mate.length > 0

  @status[:sequences_in] += 2
  @status[:residues_in]  += entries.first.length + mate.length

  entries
end
|
|
218
|
+
|
|
219
|
+
# Tell whether both entries are long enough to hold an overlap of the
# minimum length.
#
# @param entry1      [BioDSL::Seq] Sequence entry1.
# @param entry2      [BioDSL::Seq] Sequence entry2.
# @param overlap_min [Integer]     Minimum overlap.
#
# @return [Boolean] True if overlap possible otherwise false.
def overlap_possible?(entry1, entry2, overlap_min)
  [entry1, entry2].all? { |entry| entry.length >= overlap_min }
end
|
|
230
|
+
|
|
231
|
+
# Assemble a pair of given entries if possible and return the assembled
# entry, or nil if the entries could not be assembled.
#
# The :mismatch_percent, :overlap_min and :overlap_max options are passed
# on to BioDSL::Assemble.pair.
#
# @param entry1 [BioDSL::Seq] Sequence entry1.
# @param entry2 [BioDSL::Seq] Sequence entry2.
#
# @return [BioDSL::Seq, nil] Returns Seq entry or nil.
def assemble_entries(entry1, entry2)
  constraints = {
    mismatches_max: @options[:mismatch_percent],
    overlap_min:    @options[:overlap_min],
    overlap_max:    @options[:overlap_max]
  }

  BioDSL::Assemble.pair(entry1, entry2, **constraints)
end
|
|
247
|
+
|
|
248
|
+
# Output an assembled entry to the output stream as a record and update
# the assembled, record, sequence and residue counts.
#
# @param assembled [BioDSL::Seq] Assembled sequence entry.
# @param output    [Enumerator::Yielder] Output stream.
def output_assembled(assembled, output)
  output << assembled2record(assembled)

  %i(assembled records_out sequences_out).each { |key| @status[key] += 1 }
  @status[:residues_out] += assembled.length
end
|
|
260
|
+
|
|
261
|
+
# Convert an assembled sequence entry to a BioDSL record.
#
# When the entry's seq_name ends in "overlap=<n>:hamming=<m>" the two
# numbers are added to the running sums and stored in the record under
# :OVERLAP_LEN and :HAMMING_DIST.
#
# @param assembled [BioDSL::Seq] Merged sequence entry.
#
# @return [Hash] BioDSL record.
def assembled2record(assembled)
  record = assembled.to_bp
  return record unless assembled.seq_name =~ /overlap=(\d+):hamming=(\d+)$/

  overlap_len, hamming_dist = Regexp.last_match.captures.map(&:to_i)

  @overlap_sum += overlap_len
  @hamming_sum += hamming_dist

  record[:OVERLAP_LEN]  = overlap_len
  record[:HAMMING_DIST] = hamming_dist
  record
end
|
|
281
|
+
|
|
282
|
+
# Append entry2 to entry1 and output the merged entry to the stream as a
# single unassembled record.
#
# @param entry1 [BioDSL::Seq] Entry1.
# @param entry2 [BioDSL::Seq] Entry2.
# @param output [Enumerator::Yielder] Output stream.
def output_merged(entry1, entry2, output)
  entry1 << entry2
  output << entry2record(entry1)

  %i(unassembled sequences_out).each { |key| @status[key] += 1 }
  @status[:residues_out] += entry1.length
  @status[:records_out]  += 1
end
|
|
297
|
+
|
|
298
|
+
# Output both entries of an unassembled pair to the stream as separate
# records and update the counts accordingly.
#
# @param entry1 [BioDSL::Seq] Entry1.
# @param entry2 [BioDSL::Seq] Entry2.
# @param output [Enumerator::Yielder] Output stream.
def output_entries(entry1, entry2, output)
  [entry1, entry2].each { |entry| output << entry2record(entry) }

  %i(unassembled sequences_out).each { |key| @status[key] += 2 }
  @status[:residues_out] += entry1.length + entry2.length
  @status[:records_out]  += 2
end
|
|
312
|
+
|
|
313
|
+
# Convert an unassembled sequence entry to a BioDSL record with
# :OVERLAP_LEN set to 0 and :HAMMING_DIST set to the entry length.
#
# @param entry [BioDSL::Seq] Sequence entry.
#
# @return [Hash] BioDSL record.
def entry2record(entry)
  entry.to_bp.tap do |record|
    record[:OVERLAP_LEN]  = 0
    record[:HAMMING_DIST] = entry.length
  end
end
|
|
324
|
+
|
|
325
|
+
# Calculate additional status values.
#
# Adds :assembled_percent (assembled pairs relative to input sequences),
# :overlap_mean and :hamming_dist_mean (running sums divided by the number
# of records output) to the status, each rounded to two decimals.
#
# When no sequences were read or no records were output the corresponding
# values are set to 0.0 rather than dividing by zero, which would store NaN
# in the status.
def calc_status
  sequences_in = @status[:sequences_in]
  records_out  = @status[:records_out]

  @status[:assembled_percent] =
    if sequences_in.zero?
      0.0
    else
      # Each assembly consumes two input sequences, hence the factor 2.
      (100 * 2 * @status[:assembled].to_f / sequences_in).round(2)
    end

  # NOTE(review): the means are taken over all records output, not only
  # the assembled ones - confirm that this is intended.
  @status[:overlap_mean] =
    records_out.zero? ? 0.0 : (@overlap_sum.to_f / records_out).round(2)

  @status[:hamming_dist_mean] =
    records_out.zero? ? 0.0 : (@hamming_sum.to_f / records_out).round(2)
end
|
|
335
|
+
end
|
|
336
|
+
end
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# rubocop:disable ClassLength
|
|
30
|
+
|
|
31
|
+
# == Assemble sequences in the stream using IDBA_UD.
|
|
32
|
+
#
|
|
33
|
+
# +assemble_seq_idba+ is a wrapper around the prokaryotic metagenome
|
|
34
|
+
# assembler IDBA_UD:
|
|
35
|
+
#
|
|
36
|
+
# http://i.cs.hku.hk/~alse/hkubrg/projects/idba_ud/
|
|
37
|
+
#
|
|
38
|
+
# Any records containing sequence information will be included in the
|
|
39
|
+
# assembly, but only the assembled contig sequences will be output to the
|
|
40
|
+
# stream.
|
|
41
|
+
#
|
|
42
|
+
# The sequence records may contain quality scores, and if the sequence
|
|
43
|
+
# names indicate that the sequence order is inter-leaved paired-end
|
|
44
|
+
# assembly will be performed.
|
|
45
|
+
#
|
|
46
|
+
# == Usage
|
|
47
|
+
#
|
|
48
|
+
# assemble_seq_idba([kmer_min: <uint>[, kmer_max: <uint>[, cpus: <uint>]]])
|
|
49
|
+
#
|
|
50
|
+
# === Options
|
|
51
|
+
#
|
|
52
|
+
# * kmer_min: <uint> - Minimum k-mer value (default: 24).
|
|
53
|
+
# * kmer_max: <uint> - Maximum k-mer value (default: 48).
|
|
54
|
+
# * cpus: <uint> - Number of CPUs to use (default: 1).
|
|
55
|
+
#
|
|
56
|
+
# == Examples
|
|
57
|
+
#
|
|
58
|
+
# If you have two pair-end sequence files with the Illumina data then you
|
|
59
|
+
# can assemble these using +assemble_seq_idba+ like this:
|
|
60
|
+
#
|
|
61
|
+
# BP.new.
|
|
62
|
+
# read_fastq(input: "file1.fq", input2: "file2.fq").
|
|
63
|
+
# assemble_seq_idba.
|
|
64
|
+
# write_fasta(output: "contigs.fna").
|
|
65
|
+
# run
|
|
66
|
+
class AssembleSeqIdba
|
|
67
|
+
require 'English'
|
|
68
|
+
require 'BioDSL/helpers/aux_helper'
|
|
69
|
+
|
|
70
|
+
include AuxHelper
|
|
71
|
+
|
|
72
|
+
STATS = %i(records_in records_out sequences_in sequences_out residues_in
|
|
73
|
+
residues_out)
|
|
74
|
+
|
|
75
|
+
# Create a new AssembleSeqIdba command instance.
#
# The list of contig lengths starts out empty. The presence of the
# +idba_ud+ executable is checked with +aux_exist+ before the options are
# checked and given default values.
#
# @param options [Hash] Options hash.
# @option options [Integer] :kmer_min Minimum kmer value.
# @option options [Integer] :kmer_max Maximum kmer value.
# @option options [Integer] :cpus     CPUs to use.
#
# @return [AssembleSeqIdba] Returns an instance of the class.
def initialize(options)
  @lengths = []
  @options = options

  aux_exist('idba_ud')
  check_options
  defaults
end
|
|
91
|
+
|
|
92
|
+
# Return a lambda for the AssembleSeqIdba command.
#
# Inside a temporary directory the lambda writes the input sequences to a
# FASTA file, runs IDBA on that file and outputs the resulting contigs to
# the stream. Afterwards the contig statistics are added to the status.
#
# @return [Proc] Returns the command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    TmpDir.create('reads.fna', 'contig.fa') do |fa_in, fa_out, tmp_dir|
      process_input(input, output, fa_in)
      execute_idba(fa_in, tmp_dir)
      # Contig lengths are collected in @lengths by process_output, so its
      # return value is not kept.
      process_output(output, fa_out)
    end

    calc_n50(status)
  end
end
|
|
108
|
+
|
|
109
|
+
private
|
|
110
|
+
|
|
111
|
+
# Validate the options given to the command: only :kmer_min, :kmer_max and
# :cpus are allowed, and each is asserted against a lower and upper bound.
def check_options
  options_allowed(@options, :kmer_min, :kmer_max, :cpus)

  [':kmer_min >= 16', ':kmer_min <= 256',
   ':kmer_max >= 16', ':kmer_max <= 512',
   ':cpus >= 1'].each { |assertion| options_assert(@options, assertion) }

  options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
end
|
|
121
|
+
|
|
122
|
+
# Fill in default values for the options that were not given:
# :kmer_min falls back to 24, :kmer_max to 48 and :cpus to 1.
def defaults
  { kmer_min: 24, kmer_max: 48, cpus: 1 }.each do |key, value|
    @options[key] ||= value
  end

  @options[:cpus]
end
|
|
128
|
+
|
|
129
|
+
# Read all records from input and emit non-sequence records to the output
# stream. Records with a :SEQ key are written as FASTA entries to a
# temporary file and are not emitted.
#
# @param input  [Enumerator] input stream.
# @param output [Enumerator::Yielder] Output stream.
# @param fa_in  [String] Path to temporary FASTA file.
def process_input(input, output, fa_in)
  BioDSL::Fasta.open(fa_in, 'w') do |fasta_io|
    input.each do |record|
      @status[:records_in] += 1

      if record.key? :SEQ
        entry = BioDSL::Seq.new_bp(record)

        @status[:sequences_in] += 1
        @status[:residues_in]  += entry.length

        fasta_io.puts entry.to_fasta
      else
        # Emit with << as process_output does; the output stream is
        # documented as an Enumerator::Yielder, which has no #puts.
        output << record
        @status[:records_out] += 1
      end
    end
  end
end
|
|
154
|
+
|
|
155
|
+
# Run IDBA through a system call, printing the command line to stderr first
# when BioDSL is verbose.
#
# @param fa_in   [String] Path to input FASTA file.
# @param tmp_dir [String] Temporary directory path.
#
# @raise [RuntimeError] With the command line as message if the command
#   does not exit successfully.
def execute_idba(fa_in, tmp_dir)
  command = compile_cmd_line(fa_in, tmp_dir)

  $stderr.puts "Running: #{command}" if BioDSL.verbose

  system(command)

  return if $CHILD_STATUS.success?

  fail command
end
|
|
168
|
+
|
|
169
|
+
# Build the command line for executing IDBA from the input file, output
# directory and the k-mer and cpu options. Unless BioDSL is verbose the
# command's stdout and stderr are redirected to /dev/null.
#
# @param fa_in   [String] Path to input FASTA file.
# @param tmp_dir [String] Temporary directory path.
#
# @return [String] The command line for the IDBA system call.
def compile_cmd_line(fa_in, tmp_dir)
  parts = [
    'idba_ud',
    "--read #{fa_in}",
    "--out #{tmp_dir}",
    "--mink #{@options[:kmer_min]}",
    "--maxk #{@options[:kmer_max]}",
    "--num_threads #{@options[:cpus]}"
  ]
  parts.push('> /dev/null 2>&1') unless BioDSL.verbose

  parts.join(' ')
end
|
|
187
|
+
|
|
188
|
+
# Read the IDBA assembled contigs, output each as a record to the stream,
# update the output counts and remember every contig length in @lengths.
#
# @param output [Enumerator::Yielder] Output stream.
# @param fa_out [String] Path to contig FASTA file.
def process_output(output, fa_out)
  BioDSL::Fasta.open(fa_out, 'r') do |contigs|
    contigs.each do |contig|
      output << contig.to_bp

      %i(records_out sequences_out).each { |key| @status[key] += 1 }
      @status[:residues_out] += contig.length

      @lengths << contig.length
    end
  end
end
|
|
204
|
+
|
|
205
|
+
# Calculate contig statistics and add them to the status.
#
# The contig lengths are sorted from longest to shortest. :contig_max and
# :contig_min are the first and last of these (0 when there are no
# contigs), and :contig_n50 is the length of the contig at which the
# running total of lengths first reaches half of status[:residues_out].
#
# {http://en.wikipedia.org/wiki/N50_statistic}
#
# @param status [Hash] Status hash.
def calc_n50(status)
  @lengths.sort!.reverse!

  status[:contig_max] = @lengths.first || 0
  status[:contig_min] = @lengths.last || 0
  status[:contig_n50] = 0

  running_total = 0

  @lengths.each do |contig_length|
    running_total += contig_length

    next if running_total < status[:residues_out] * 0.50

    status[:contig_n50] = contig_length
    break
  end
end
|
|
229
|
+
end
|
|
230
|
+
end
|