BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Run uclust on sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# This is a wrapper for the +usearch+ tool to run the program uclust.
|
|
32
|
+
# Basically sequence type records are clustered de-novo and records containing
|
|
33
|
+
# sequence and cluster information are output. If the +align+ option is given
|
|
34
|
+
# the sequences will be aligned.
|
|
35
|
+
#
|
|
36
|
+
# Please refer to the manual:
|
|
37
|
+
#
|
|
38
|
+
# http://www.drive5.com/usearch/manual/cmd_cluster_smallmem.html
|
|
39
|
+
#
|
|
40
|
+
# Usearch 7.0 must be installed for +usearch+ to work. Read more here:
|
|
41
|
+
#
|
|
42
|
+
# http://www.drive5.com/usearch/
|
|
43
|
+
#
|
|
44
|
+
# == Usage
|
|
45
|
+
#
|
|
46
|
+
# uclust(<identity: float>, <strand: "plus|both">[, align: <bool>
|
|
47
|
+
# [, cpus: <uint>]])
|
|
48
|
+
#
|
|
49
|
+
# === Options
|
|
50
|
+
#
|
|
51
|
+
# * identity: <float> - Similarity for matching as a fraction between 0.0 and
|
|
52
|
+
# 1.0.
|
|
53
|
+
# * strand: <string> - For nucleotide search report hits from plus or both
|
|
54
|
+
# strands.
|
|
55
|
+
# * align: <bool> - Align sequences.
|
|
56
|
+
# * cpus: <uint> - Number of CPU cores to use (default=1).
|
|
57
|
+
#
|
|
58
|
+
# == Examples
|
|
59
|
+
#
|
|
60
|
+
# rubocop: disable ClassLength
|
|
61
|
+
# Command class for the +uclust+ step. Records carrying a sequence are written
# to a temporary FASTA file, clustered via BioDSL::Usearch.cluster_smallmem and
# emitted again together with the cluster information; records without a
# sequence are passed straight through to the output stream.
#
# NOTE(review): +status_init+ and the +options_*+ validators are not defined in
# this class; they are presumably mixed in elsewhere in the framework.
class Uclust
  require 'BioDSL/helpers/aux_helper'

  include AuxHelper

  # Names of the status counters initialised at the start of every run.
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out clusters_out)

  # Constructor for Uclust.
  #
  # The given hash is kept by reference and +:cpus+ is defaulted to 1 in
  # place when it is not set.
  #
  # @param options [Hash] Options hash.
  # @option options [Float]         :identity
  # @option options [String,Symbol] :strand
  # @option options [Boolean]       :align
  # @option options [Integer]       :cpus
  #
  # @return [Uclust] Class instance.
  def initialize(options)
    @options = options
    @options[:cpus] ||= 1

    aux_exist('usearch')
    check_options
  end

  # Return command lambda for uclust.
  #
  # The lambda takes the input stream, the output stream and a status hash.
  # It writes the input to temporary files, runs uclust, and emits either the
  # aligned entries (+align+ option) or the original records merged with
  # their cluster data.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      TmpDir.create('rec', 'in', 'out') do |tmp_rec, tmp_in, tmp_out|
        process_input(input, output, tmp_rec, tmp_in)

        run_uclust(tmp_in, tmp_out)

        if @options[:align]
          process_output_align(output, tmp_out)
        else
          process_output(output, tmp_rec, tmp_out)
        end
      end
    end
  end

  private

  # Check options: only known keys are accepted, +:identity+ and +:strand+
  # are mandatory, +:identity+ must be in (0.0, 1.0] and +:cpus+ in
  # [1, BioDSL::Config::CORES_MAX].
  def check_options
    options_allowed(@options, :identity, :strand, :align, :cpus)
    options_required(@options, :identity, :strand)
    options_allowed_values(@options, strand: ['plus', 'both', :plus, :both])
    options_allowed_values(@options, align: [nil, false, true])
    options_assert(@options, ':identity > 0.0')
    options_assert(@options, ':identity <= 1.0')
    options_assert(@options, ':cpus >= 1')
    options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
  end

  # Process input data and serialize all records into a temporary file and all
  # records containing sequence to a temporary FASTA file.
  #
  # @param input   [Enumerator]          Input stream
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_rec [String]              Path to serialized records file.
  # @param tmp_in  [String]              Path to input file.
  def process_input(input, output, tmp_rec, tmp_in)
    File.open(tmp_rec, 'wb') do |ios_rec|
      BioDSL::Serializer.new(ios_rec) do |s|
        BioDSL::Fasta.open(tmp_in, 'w') do |ios|
          process_input_records(input, output, ios, s)
        end
      end
    end
  end

  # Iterate over records in the input stream and serialize all records. Also,
  # records with sequence are saved in a FASTA file or emitted to the output
  # stream if the record contains no sequence.
  #
  # @param input      [Enumerator]          Input stream
  # @param output     [Enumerator::Yielder] Output stream.
  # @param ios        [Fasta::IO]           Output stream to a FASTA file
  # @param serializer [BioDSL::Serializer]  Serializer IO.
  def process_input_records(input, output, ios, serializer)
    input.each_with_index do |record, i|
      @status[:records_in] += 1

      if record[:SEQ]
        output_entry(ios, record, i)
      else
        @status[:records_out] += 1
        output << record
      end

      # Serialized after output_entry so that a SEQ_NAME assigned there is
      # part of the stored record.
      serializer << record
    end
  end

  # Save a BioDSL record to a FASTA file.
  #
  # A record without SEQ_NAME gets its stream index (as a string) assigned as
  # name; the record itself is modified.
  #
  # @param ios    [Fasta::IO] Output stream to a FASTA file
  # @param record [Hash]      BioDSL record.
  # @param i      [Integer]   Record index.
  def output_entry(ios, record, i)
    @status[:sequences_in] += 1

    record[:SEQ_NAME] ||= i.to_s

    entry = BioDSL::Seq.new(seq_name: record[:SEQ_NAME], seq: record[:SEQ])

    @status[:residues_in] += entry.length

    ios.puts entry.to_fasta
  end

  # Run the uclust command.
  #
  # A BioDSL::UsearchError whose message matches "Empty input file" is
  # swallowed; any other error is re-raised.
  #
  # @param tmp_in  [String] Path to input file.
  # @param tmp_out [String] Path to output file.
  #
  # @raise [BioDSL::UsearchError] if command fails.
  def run_uclust(tmp_in, tmp_out)
    uclust_opts = {
      input: tmp_in,
      output: tmp_out,
      strand: @options[:strand],
      identity: @options[:identity],
      align: @options[:align],
      cpus: @options[:cpus],
      verbose: @options[:verbose]
    }

    BioDSL::Usearch.cluster_smallmem(uclust_opts)
  rescue BioDSL::UsearchError => e
    raise unless e.message =~ /Empty input file/
  end

  # Parse uclust output file and return a hash with Q_ID as key and the uclust
  # record as value.
  #
  # @param tmp_out [String] Path to output file.
  #
  # @return [Hash] Q_ID as keys and Uclust records.
  def parse_output(tmp_out)
    results = {}

    BioDSL::Usearch.open(tmp_out) do |ios|
      ios.each(:uc) do |record|
        record[:RECORD_TYPE] = 'uclust'

        results[record[:Q_ID]] = record
      end
    end

    results
  end

  # Parse MSA alignment data from uclust output file and emit to the output
  # stream.
  #
  # Entries named 'consensus' are not emitted; each one increments the
  # clusters_out counter. Every other entry is emitted with the current
  # counter value as its CLUSTER.
  #
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_out [String]              Path to uclust output file.
  def process_output_align(output, tmp_out)
    BioDSL::Fasta.open(tmp_out) do |ios|
      ios.each do |entry|
        if entry.seq_name == 'consensus'
          @status[:clusters_out] += 1
        else
          record = {RECORD_TYPE: 'uclust', CLUSTER: @status[:clusters_out]}
          record.merge!(entry.to_bp)

          output << record
          @status[:records_out]   += 1
          @status[:sequences_out] += 1
          @status[:residues_out]  += entry.length
        end
      end
    end
  end

  # Parse results from uclust and merge with serialized data and output to the
  # output stream.
  #
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_rec [String]              Path to serialized records file.
  # @param tmp_out [String]              Path to uclust output file.
  def process_output(output, tmp_rec, tmp_out)
    results = parse_output(tmp_out)

    File.open(tmp_rec, 'rb') do |ios_rec|
      BioDSL::Serializer.new(ios_rec) do |s|
        process_output_serial(s, results, output)
      end
    end
  end

  # Deserialize records from temporary file, merge these with cluster data and
  # emit to the output stream.
  #
  # @param serializer [BioDSL::Serializer]
  #   Serializer IO.
  #
  # @param results [Hash]
  #   Results from uclust with Q_ID as key and uclust record as value
  #
  # @param output [Enumerator::Yielder]
  #   Output stream.
  #
  # @raise [BioDSL::UsearchError]
  #   If a sequence record has no entry in the uclust results.
  def process_output_serial(serializer, results, output)
    serializer.each do |record|
      # Records without sequence were already emitted by
      # process_input_records and were never written to the uclust input, so
      # they must be skipped here even when they carry a SEQ_NAME. Every
      # sequence record has a SEQ_NAME at this point (see output_entry).
      next unless record[:SEQ]

      if (r = results[record[:SEQ_NAME]])
        output << record.merge(r)
        @status[:records_out]   += 1
        @status[:sequences_out] += 1
        @status[:residues_out]  += record[:SEQ].length
      else
        fail BioDSL::UsearchError, 'Sequence name: ' \
          "#{record[:SEQ_NAME]} not found in uclust results"
      end
    end
  end
end
|
|
286
|
+
end
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Select unique or non-unique records based on the value of a given key.
|
|
30
|
+
#
|
|
31
|
+
# +unique_values+ selects records from the stream by checking values of a
|
|
32
|
+
# given key. If a duplicate record exists based on the given key, it will only
|
|
33
|
+
# output one record (the first). If the +invert+ option is used, then
|
|
34
|
+
# non-unique records are selected.
|
|
35
|
+
#
|
|
36
|
+
# == Usage
|
|
37
|
+
#
|
|
38
|
+
# unique_values(<key: string>[, invert: <bool>])
|
|
39
|
+
#
|
|
40
|
+
# === Options
|
|
41
|
+
#
|
|
42
|
+
# * key: <string> - Key for which the value is checked for uniqueness.
|
|
43
|
+
# * invert: <bool> - Select non-unique records (default=false).
|
|
44
|
+
#
|
|
45
|
+
# == Examples
|
|
46
|
+
#
|
|
47
|
+
# Consider the following two column table in the file `test.tab`:
|
|
48
|
+
#
|
|
49
|
+
# Human H1
|
|
50
|
+
# Human H2
|
|
51
|
+
# Human H3
|
|
52
|
+
# Dog D1
|
|
53
|
+
# Dog D2
|
|
54
|
+
# Mouse M1
|
|
55
|
+
#
|
|
56
|
+
# To output only unique values for the first column we first read the table
|
|
57
|
+
# with +read_table+ and then pass the result to +unique_values+:
|
|
58
|
+
#
|
|
59
|
+
# BP.new.read_table(input: "test.tab").unique_values(key: :V0).dump.run
|
|
60
|
+
#
|
|
61
|
+
# {:V0=>"Human", :V1=>"H1"}
|
|
62
|
+
# {:V0=>"Dog", :V1=>"D1"}
|
|
63
|
+
# {:V0=>"Mouse", :V1=>"M1"}
|
|
64
|
+
#
|
|
65
|
+
# To output duplicate records instead use the +invert+ option:
|
|
66
|
+
#
|
|
67
|
+
# BP.new.
|
|
68
|
+
# read_table(input: "test.tab").
|
|
69
|
+
# unique_values(key: :V0, invert: true).
|
|
70
|
+
# dump.
|
|
71
|
+
# run
|
|
72
|
+
#
|
|
73
|
+
# {:V0=>"Human", :V1=>"H2"}
|
|
74
|
+
# {:V0=>"Human", :V1=>"H3"}
|
|
75
|
+
# {:V0=>"Dog", :V1=>"D2"}
|
|
76
|
+
# Command class for the +unique_values+ step: passes on either the first
# record seen for each value of a given key, or (with +invert+) every record
# whose value was seen before.
#
# NOTE(review): +status_init+ and the +options_*+ validators are not defined in
# this class; they are presumably mixed in elsewhere in the framework.
class UniqueValues
  require 'set'

  # Names of the status counters initialised at the start of every run.
  STATS = %i(records_in records_out)

  # Constructor for UniqueValues.
  #
  # @param options [Hash] Options hash.
  # @option options [String,Symbol] :key
  # @option options [Boolean]       :invert
  #
  # @return [UniqueValues] Class instance.
  def initialize(options)
    @options = options
    @lookup  = Set.new

    # Validate before touching options[:key]: with a missing key the
    # conversion below would fail on nil before options_required gets the
    # chance to report the problem.
    check_options

    @key    = options[:key].to_sym
    @invert = options[:invert]
  end

  # Return command lambda for unique_values
  #
  # The lambda takes the input stream, the output stream and a status hash and
  # emits every record accepted by +output_record?+.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        if output_record?(record)
          output << record
          @status[:records_out] += 1
        end
      end
    end
  end

  private

  # Check options: only +:key+ and +:invert+ are accepted, +:key+ is
  # mandatory and +:invert+ must be true, false or nil.
  def check_options
    options_allowed(@options, :key, :invert)
    options_required(@options, :key)
    options_allowed_values(@options, invert: [true, false, nil])
  end

  # Determine if a record should be output or not. If the wanted key is not
  # present in the record (or its value is nil/false) it will be output. If
  # the value is unique the record will be output, unless the +invert+ option
  # was used which will result in non-unique records to be output.
  #
  # String values are converted to symbols before the lookup, so a String and
  # a Symbol with the same text count as the same value.
  #
  # @param record [Hash] BioDSL record.
  #
  # @return [Boolean]
  def output_record?(record)
    return true unless (value = record[@key])

    value = value.to_sym if value.is_a? String

    # Set#add? returns nil when the value is already in the set, so this both
    # records the value and tells whether it was seen before.
    seen = @lookup.add?(value).nil?

    @invert ? seen : !seen
  end
end
|
|
145
|
+
end
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Run usearch_global on sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# This is a wrapper for the +usearch+ tool to run the program usearch_global.
|
|
32
|
+
# Basically sequence type records are searched against a reference database
|
|
33
|
+
# and records with hit information are output.
|
|
34
|
+
#
|
|
35
|
+
# Please refer to the manual:
|
|
36
|
+
#
|
|
37
|
+
# http://drive5.com/usearch/manual/usearch_global.html
|
|
38
|
+
#
|
|
39
|
+
# Usearch 7.0 must be installed for +usearch+ to work. Read more here:
|
|
40
|
+
#
|
|
41
|
+
# http://www.drive5.com/usearch/
|
|
42
|
+
#
|
|
43
|
+
# == Usage
|
|
44
|
+
#
|
|
45
|
+
# usearch_global(<database: file>, <identity: float>,
|
|
46
|
+
# <strand: "plus|both">[, cpus: <uint>])
|
|
47
|
+
#
|
|
48
|
+
# === Options
|
|
49
|
+
#
|
|
50
|
+
# * database: <file> - Database to search (in FASTA format).
|
|
51
|
+
# * identity: <float> - Similarity for matching as a fraction between 0.0 and
|
|
52
|
+
# 1.0.
|
|
53
|
+
# * strand: <string> - For nucleotide search report hits from plus or both
|
|
54
|
+
# strands.
|
|
55
|
+
# * cpus: <uint> - Number of CPU cores to use (default=1).
|
|
56
|
+
#
|
|
57
|
+
# == Examples
|
|
58
|
+
#
|
|
59
|
+
# Command class for the +usearch_global+ step. Every input record is passed on
# to the output stream; records carrying a sequence are additionally written
# to a temporary FASTA file that is searched against the database, and the
# resulting hits are appended to the output stream.
#
# NOTE(review): +status_init+ and the +options_*+ validators are not defined in
# this class; they are presumably mixed in elsewhere in the framework.
class UsearchGlobal
  require 'BioDSL/helpers/aux_helper'

  include AuxHelper

  # Names of the status counters initialised at the start of every run.
  STATS = %i(records_in records_out sequences_in hits_out)

  # Constructor for UsearchGlobal.
  #
  # The given hash is kept by reference and +:cpus+ is defaulted to 1 in
  # place when it is not set.
  #
  # @param options [Hash] Options hash.
  # @option options [String]        :database
  # @option options [Float]         :identity
  # @option options [String,Symbol] :strand
  # @option options [Integer]       :cpus
  #
  # @return [UsearchGlobal] Class instance.
  def initialize(options)
    @options = options
    @options[:cpus] ||= 1

    aux_exist('usearch')
    check_options
  end

  # Return command lambda for usearch_global.
  #
  # @return [Proc] Command lambda taking input stream, output stream and
  #   status hash.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      TmpDir.create('in', 'out') do |query_path, hits_path|
        process_input(input, output, query_path)
        run_usearch_global(query_path, hits_path)
        process_output(output, hits_path)
      end
    end
  end

  private

  # Check options: only known keys are accepted, +:database+ and +:identity+
  # are mandatory, the database file must exist, +:identity+ must be in
  # (0.0, 1.0] and +:cpus+ in [1, BioDSL::Config::CORES_MAX].
  def check_options
    options_allowed(@options, :database, :identity, :strand, :cpus)
    options_required(@options, :database, :identity)
    options_allowed_values(@options, strand: ['plus', 'both', :plus, :both])
    options_files_exist(@options, :database)
    options_assert(@options, ':identity > 0.0')
    options_assert(@options, ':identity <= 1.0')
    options_assert(@options, ':cpus >= 1')
    options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
  end

  # Pass every input record on to the output stream and write those that
  # contain a sequence to a temporary FASTA file.
  #
  # @param input  [Enumerator]          Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  # @param tmp_in [String]              Path to temporary file.
  def process_input(input, output, tmp_in)
    BioDSL::Fasta.open(tmp_in, 'w') do |fasta|
      input.each_with_index do |record, index|
        @status[:records_in] += 1

        output << record

        @status[:records_out] += 1

        write_fasta_entry(fasta, record, index) if record[:SEQ]
      end
    end
  end

  # Write the sequence of a single record to the FASTA file. Records without
  # a SEQ_NAME are named after their index in the stream.
  #
  # @param fasta  [Fasta::IO] Output stream to a FASTA file.
  # @param record [Hash]      BioDSL record with a SEQ value.
  # @param index  [Integer]   Record index.
  def write_fasta_entry(fasta, record, index)
    @status[:sequences_in] += 1

    name  = record[:SEQ_NAME] || index.to_s
    entry = BioDSL::Seq.new(seq_name: name, seq: record[:SEQ])

    fasta.puts entry.to_fasta
  end

  # Run usearch global on the input file and save results in the output file.
  #
  # A BioDSL::UsearchError whose message matches "Empty input file" is
  # swallowed; any other error is re-raised.
  #
  # @param tmp_in  [String] Path to input file.
  # @param tmp_out [String] Path to output file.
  #
  # @raise [BioDSL::UsearchError] if command fails.
  def run_usearch_global(tmp_in, tmp_out)
    search_opts = {
      input: tmp_in,
      output: tmp_out,
      database: @options[:database],
      strand: @options[:strand],
      identity: @options[:identity],
      cpus: @options[:cpus],
      verbose: @options[:verbose]
    }

    BioDSL::Usearch.usearch_global(search_opts)
  rescue BioDSL::UsearchError => error
    return if error.message =~ /Empty input file/

    raise
  end

  # Parse usearch output file and emit records to the output stream.
  #
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_out [String]              Path to output file.
  def process_output(output, tmp_out)
    BioDSL::Usearch.open(tmp_out) do |usearch_io|
      usearch_io.each(:uc) do |hit|
        hit[:RECORD_TYPE] = 'usearch'
        output << hit

        # Each hit counts both as a hit and as an emitted record.
        @status[:hits_out]    += 1
        @status[:records_out] += 1
      end
    end
  end
end
|
|
171
|
+
end
|