BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Classify sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +classify_seq+ searches sequences in the stream against a pre-indexed
|
|
32
|
+
# (using +index_taxonomy+) database. The database consists of a taxonomic tree
|
|
33
|
+
# index and indices for each taxonomic level saved in the following files
|
|
34
|
+
# (here using the prefix "taxonomy"):
|
|
35
|
+
#
|
|
36
|
+
# * taxonomy_tax_index.dat - return node for a given node id.
|
|
37
|
+
# * taxonomy_kmer_index.dat - return list of node ids for a given level
|
|
38
|
+
# and kmer.
|
|
39
|
+
#
|
|
40
|
+
# Each sequence is broken down into unique kmers of a given kmer_size
|
|
41
|
+
# overlapping with a given step_size - see +index_taxonomy+. Now, for each
|
|
42
|
+
# taxonomic level, starting from species, all nodes for each kmer are looked
|
|
43
|
+
# up in the database. The nodes containing most kmers are considered hits.
|
|
44
|
+
# If there are no hits at a taxonomic level, we move to the next level. Hits
|
|
45
|
+
# are sorted according to how many kmers matched this particular node and a
|
|
46
|
+
# consensus taxonomy string is determined. Hits are also filtered with the
|
|
47
|
+
# following options:
|
|
48
|
+
#
|
|
49
|
+
# * hits_max - Include maximally this number of hits in the consensus.
|
|
50
|
+
# * best_only - Include only the best scoring hits in the consensus.
|
|
51
|
+
# That is if a hit consists of 344 kmers out of 345
|
|
52
|
+
# possible, only hits with 344 kmers are included.
|
|
53
|
+
# * coverage - Filter hits based on kmer coverage. If a hit contains
|
|
54
|
+
# fewer kmers than the total amount of kmers x coverage
|
|
55
|
+
# it will be filtered.
|
|
56
|
+
# * consensus - For a number of hits accept consensus at a given level
|
|
57
|
+
# if within this percentage.
|
|
58
|
+
#
|
|
59
|
+
# The output of +classify_seq+ are sequence type records with the
|
|
60
|
+
# additional keys:
|
|
61
|
+
#
|
|
62
|
+
# * TAXONOMY_HITS - The number of hits used in the consensus.
|
|
63
|
+
# * TAXONOMY - The taxonomy string.
|
|
64
|
+
#
|
|
65
|
+
# The consensus is determined from a list of taxonomic strings, i.e. the
|
|
66
|
+
# TAXONOMIC_HITS, and is composed of a consensus for each taxonomic level.
|
|
67
|
+
# E.g. for the kingdom level if 60% of the taxonomic strings indicate
|
|
68
|
+
# 'Bacteria' and the consensus is 50% then the consensus for the kingdom
|
|
69
|
+
# level will be reported as 'Bacteria(60)'. If the name at any level
|
|
70
|
+
# consists of multiple words they are treated independently. E.g. if we have
|
|
71
|
+
# three taxonomic strings at the species level with the names:
|
|
72
|
+
#
|
|
73
|
+
# * Escherichia coli K-12
|
|
74
|
+
# * Escherichia coli sp. AC3432
|
|
75
|
+
# * Escherichia coli sp. AC1232
|
|
76
|
+
#
|
|
77
|
+
# The corresponding consensus for that level will be reported as
|
|
78
|
+
# 'Escherichia coli sp.(100/100/66)'. The fourth word in the last two
|
|
79
|
+
# taxonomy strings (AC3432 and AC1232) have a consensus below 50% and are
|
|
80
|
+
# ignored.
|
|
81
|
+
#
|
|
82
|
+
# == Usage
|
|
83
|
+
#
|
|
84
|
+
# classify_seq(<dir: <dir>>[, prefix: <string>[, kmer_size: <uint>
|
|
85
|
+
# [, step_size: <uint>[, hits_max: <uint>[, consensus:
|
|
86
|
+
# <float>[, coverage: <float>[, best_only: <bool>]]]]]]])
|
|
87
|
+
#
|
|
88
|
+
# === Options
|
|
89
|
+
#
|
|
90
|
+
# * dir: <dir> - Directory containing taxonomy files.
|
|
91
|
+
# * prefix: <string> - Taxonomy files prefix (default="taxonomy").
|
|
92
|
+
# * kmer_size: <uint> - Kmer size (default=8).
|
|
93
|
+
# * step_size: <uint> - Step size (default=1).
|
|
94
|
+
# * hits_max: <uint> - Maximum hits to include in consensus (default=50).
|
|
95
|
+
# * consensus: <float> - Consensus cutoff (default=0.51).
|
|
96
|
+
# * coverage: <float> - Coverage cutoff (default=0.9).
|
|
97
|
+
# * best_only: <bool> - Only use best hits for consensus (default=true).
|
|
98
|
+
#
|
|
99
|
+
# == Examples
|
|
100
|
+
#
|
|
101
|
+
# To classify a bunch of OTU sequences in the file +otus.fna+ we do:
|
|
102
|
+
#
|
|
103
|
+
# BP.new.
|
|
104
|
+
# read_fasta(input: "otus.fna").
|
|
105
|
+
# classify_seq(dir: "RDP11_3").
|
|
106
|
+
# write_table(keys: [:SEQ_NAME, :TAXONOMY_HITS, :TAXONOMY]).
|
|
107
|
+
# run
|
|
108
|
+
#
|
|
109
|
+
# OTU_0 1 K#Bacteria(100);P#Proteobacteria(100);C#Gammaproteobacteria...
|
|
110
|
+
# OTU_1 1 K#Bacteria(100);P#Proteobacteria(100);C#Gammaproteobacteria...
|
|
111
|
+
# OTU_2 1 K#Bacteria(100);P#Proteobacteria(100);C#Gammaproteobacteria...
|
|
112
|
+
# OTU_3 1 K#Bacteria(100);P#Proteobacteria(100);C#Gammaproteobacteria...
|
|
113
|
+
# OTU_4 2 K#Bacteria(100);P#Fusobacteria(100);C#Fusobacteriia(100);O#...
|
|
114
|
+
class ClassifySeq
|
|
115
|
+
STATS = %i(records_in records_out sequences_in sequences_out residues_in
|
|
116
|
+
residues_out)
|
|
117
|
+
|
|
118
|
+
# Constructor for the ClassifySeq class.
|
|
119
|
+
#
|
|
120
|
+
# @param [Hash] options Options hash.
|
|
121
|
+
# @option options [String] :dir Directory path with indexes.
|
|
122
|
+
# @option options [String] :prefix Index prefix.
|
|
123
|
+
# @option options [Integer] :kmer_size Kmer size.
|
|
124
|
+
# @option options [Integer] :step_size Step size.
|
|
125
|
+
# @option options [Integer] :hits_max Max hits to report per sequence.
|
|
126
|
+
# @option options [Float] :consensus Taxonomy string consensus percent.
|
|
127
|
+
# @option options [Float] :coverage Kmer coverage filter percent.
|
|
128
|
+
# @option options [Boolean] :best_only Flag to report best hit only.
|
|
129
|
+
#
|
|
130
|
+
# @return [ClassifySeq] Returns an instance of the class.
|
|
131
|
+
def initialize(options)
  # The caller's hash is kept by reference (no copy); +defaults+ later fills
  # missing entries into this same object.
  @options = options

  # Validate what the caller supplied before any defaults are written.
  check_options
  defaults
end
|
|
137
|
+
|
|
138
|
+
# Return a lambda for the ClassifySeq command.
|
|
139
|
+
#
|
|
140
|
+
# @return [Proc] Returns the command lambda.
|
|
141
|
+
def lmb
  # The returned lambda takes the input stream, the output stream and the
  # status hash handed in by the caller.
  ->(input, output, status) do
    # status_init is defined outside this file; it is assumed to set up
    # @status with the STATS counters - TODO confirm.
    status_init(status, STATS)

    @status[:sequences_in] = 0

    search = BioDSL::Taxonomy::Search.new(@options)

    input.each_with_index do |record, index|
      @status[:records_in] += 1

      # Only records carrying a sequence are classified; every record is
      # forwarded to the output regardless.
      classify_seq(record, index, search) if record.key?(:SEQ)

      output << record
      @status[:records_out] += 1
    end
  end
end
|
|
159
|
+
|
|
160
|
+
private
|
|
161
|
+
|
|
162
|
+
# Check options.
|
|
163
|
+
def check_options
  # Every option key this command accepts.
  permitted = %i(dir prefix kmer_size step_size hits_max consensus coverage
                 best_only)

  options_allowed(@options, *permitted)
  options_required(@options, :dir)
  options_dirs_exist(@options, :dir)
  options_allowed_values(@options, best_only: [nil, true, false])

  # Numeric range checks live in their own method.
  run_assertions
end
|
|
172
|
+
|
|
173
|
+
# Run assertions.
|
|
174
|
+
def run_assertions
  # Each expression is passed, in this order, to options_assert together
  # with the options hash.
  [
    ':kmer_size > 0',
    ':kmer_size <= 12',
    ':step_size > 0',
    ':step_size <= 12',
    ':hits_max > 0',
    ':consensus > 0',
    ':consensus <= 1',
    ':coverage > 0',
    ':coverage <= 1'
  ].each { |expression| options_assert(@options, expression) }
end
|
|
185
|
+
|
|
186
|
+
# Set default options.
|
|
187
|
+
def defaults
  # Fallback values, applied only where the option is currently nil/false.
  {
    prefix: 'taxonomy',
    kmer_size: 8,
    step_size: 1,
    hits_max: 50,
    consensus: 0.51,
    coverage: 0.9
  }.each { |key, fallback| @options[key] ||= fallback }

  # best_only may be an explicit false, so only a nil value is replaced.
  @options[:best_only] = true if @options[:best_only].nil?
end
|
|
196
|
+
|
|
197
|
+
# Execute classification of a sequence-containing record.
|
|
198
|
+
#
|
|
199
|
+
# @param record [Hash] BioDSL record.
|
|
200
|
+
# @param i [Fixnum] Record number.
|
|
201
|
+
# @param search [BioDSL::Taxonomy::Search] Search object.
|
|
202
|
+
def classify_seq(record, i, search)
  # Sequence and residue counters move in lock-step for in and out, since
  # every classified record is also emitted.
  %i(sequences_in sequences_out).each { |key| @status[key] += 1 }
  %i(residues_in residues_out).each do |key|
    @status[key] += record[:SEQ].length
  end

  # Records without a name are identified by their record number.
  entry = BioDSL::Seq.new(seq_name: record[:SEQ_NAME] || i.to_s,
                          seq: record[:SEQ])
  result = search.execute(entry)

  # The record is annotated in place.
  record[:TAXONOMY]      = result.taxonomy
  record[:TAXONOMY_HITS] = result.hits
  record[:RECORD_TYPE]   = 'taxonomy'
end
|
|
216
|
+
end
|
|
217
|
+
end
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Run classify_seq_mothur on sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# This is a wrapper for the +mothur+ command +classify.seqs()+. Basically,
|
|
32
|
+
# it classifies sequences in the stream given a database file and a taxonomy
|
|
33
|
+
# file which can be downloaded here:
|
|
34
|
+
#
|
|
35
|
+
# http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip
|
|
36
|
+
#
|
|
37
|
+
# Please refer to the manual:
|
|
38
|
+
#
|
|
39
|
+
# http://www.mothur.org/wiki/Classify.seqs
|
|
40
|
+
#
|
|
41
|
+
# Mothur must be installed for +classify_seq_mothur+ to work. Read more here:
|
|
42
|
+
#
|
|
43
|
+
# http://www.mothur.org/
|
|
44
|
+
#
|
|
45
|
+
# == Usage
|
|
46
|
+
#
|
|
47
|
+
# classify_seq_mothur(<database: <file>>, <taxonomy: <file>>
|
|
48
|
+
# [, confidence: <uint>[, cpus: <uint>]])
|
|
49
|
+
#
|
|
50
|
+
# === Options
|
|
51
|
+
#
|
|
52
|
+
# * database: <file> - Database to search.
|
|
53
|
+
# * taxonomy: <file> - Taxonomy file for mapping names.
|
|
54
|
+
# * confidence: <uint> - Confidence threshold (default=80).
|
|
55
|
+
# * cpus: <uint> - Number of CPU cores to use (default=1).
|
|
56
|
+
#
|
|
57
|
+
# == Examples
|
|
58
|
+
#
|
|
59
|
+
# To classify a bunch of OTU sequences in the file +otus.fna+ we do:
|
|
60
|
+
#
|
|
61
|
+
# database = "trainset9_032012.pds.fasta"
|
|
62
|
+
# taxonomy = "trainset9_032012.pds.tax"
|
|
63
|
+
#
|
|
64
|
+
# BP.new.
|
|
65
|
+
# read_fasta(input: "otus.fna").
|
|
66
|
+
# classify_seq_mothur(database: database, taxonomy: taxonomy).
|
|
67
|
+
# grab(exact: true, keys: :RECORD_TYPE, select: "taxonomy").
|
|
68
|
+
# write_table(output: "classified.tab", header: true, force: true,
|
|
69
|
+
# skip: [:RECORD_TYPE]).
|
|
70
|
+
# run
|
|
71
|
+
class ClassifySeqMothur
|
|
72
|
+
require 'English'
|
|
73
|
+
require 'BioDSL/helpers/aux_helper'
|
|
74
|
+
|
|
75
|
+
include AuxHelper
|
|
76
|
+
|
|
77
|
+
STATS = %i(records_in records_out sequences_in sequences_out
|
|
78
|
+
residues_in residues_out)
|
|
79
|
+
|
|
80
|
+
# Constructor for ClassifySeqMothur.
|
|
81
|
+
#
|
|
82
|
+
# @param options [Hash] Options hash.
|
|
83
|
+
# @option options [String] :database Path to database file.
|
|
84
|
+
# @option options [String] :taxonomy Path to taxonomy file.
|
|
85
|
+
# @option options [Integer] :confidence Confidence cutoff.
|
|
86
|
+
# @option options [Integer] :cpus Number of CPUs to use.
|
|
87
|
+
#
|
|
88
|
+
# @return [ClassifySeqMothur] Instance of class.
|
|
89
|
+
def initialize(options)
  # The caller's hash is kept by reference; defaults are written into it.
  @options = options

  # aux_exist comes from AuxHelper (not shown here); presumably it verifies
  # that the mothur executable is available - TODO confirm.
  aux_exist('mothur')
  check_options
  defaults
end
|
|
96
|
+
|
|
97
|
+
# Command lambda for ClassifySeqMothur.
|
|
98
|
+
#
|
|
99
|
+
# @return [Proc] Lambda for the command.
|
|
100
|
+
def lmb
  # The returned lambda takes the input stream, the output stream and the
  # status hash handed in by the caller.
  lambda do |input, output, status|
    status_init(status, STATS)

    TmpDir.create('input.fasta') do |tmp_in, tmp_dir|
      # Input records are forwarded to the output while their sequences are
      # written to the temporary FASTA file that mothur reads.
      process_input(input, output, tmp_in)
      run_mothur(tmp_dir, tmp_in)

      # Mothur names its result after the input and taxonomy files, hence
      # the glob. Fail with a clear message if nothing was produced instead
      # of handing nil on to the parser.
      tmp_out = Dir.glob("#{tmp_dir}/input.*.taxonomy").first
      fail "Mothur taxonomy output not found in #{tmp_dir}" unless tmp_out

      process_output(output, tmp_out)
    end
  end
end
|
|
112
|
+
|
|
113
|
+
private
|
|
114
|
+
|
|
115
|
+
# Check options.
|
|
116
|
+
def check_options
  # Validation only: the options hash is not modified here. Defaults are
  # applied by the constructor right after this method returns.
  options_allowed(@options, :database, :taxonomy, :confidence, :cpus)
  options_required(@options, :database, :taxonomy)
  options_files_exist(@options, :database, :taxonomy)
  options_assert(@options, ':confidence > 0')
  options_assert(@options, ':confidence <= 100')
  options_assert(@options, ':cpus >= 1')
  options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
end
|
|
127
|
+
|
|
128
|
+
# Set default options.
|
|
129
|
+
def defaults
  # Fill in the confidence cutoff and CPU count when not supplied.
  @options[:confidence] = 80 unless @options[:confidence]
  @options[:cpus] ||= 1
end
|
|
133
|
+
|
|
134
|
+
# Process input data and save sequences to a temporary file for
|
|
135
|
+
# classification.
|
|
136
|
+
#
|
|
137
|
+
# @param input [Enumerator] Input stream.
|
|
138
|
+
# @param output [Enumerator::Yielder] Output stream.
|
|
139
|
+
# @param tmp_in [String] Path to temporary FASTA file.
|
|
140
|
+
def process_input(input, output, tmp_in)
  BioDSL::Fasta.open(tmp_in, 'w') do |ios|
    input.each_with_index do |record, i|
      @status[:records_in] += 1

      if record[:SEQ]
        @status[:sequences_in]  += 1
        @status[:sequences_out] += 1
        @status[:residues_in]   += record[:SEQ].length
        # Residue count belongs in :residues_out. It was previously added
        # to :records_out, which inflated the record count and left
        # :residues_out untouched.
        @status[:residues_out]  += record[:SEQ].length

        # Records without a name are identified by their record number.
        seq_name = record[:SEQ_NAME] || i.to_s
        entry    = BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])

        ios.puts entry.to_fasta
      end

      # Every input record is passed through, with or without a sequence.
      output << record
      @status[:records_out] += 1
    end
  end
end
|
|
162
|
+
|
|
163
|
+
# Run Mothur using a system call.
|
|
164
|
+
#
|
|
165
|
+
# @param tmp_dir [String] Path to temporary dir.
|
|
166
|
+
# @param tmp_in [String] Path to input file.
|
|
167
|
+
#
|
|
168
|
+
# @raise [RuntimeError] If system call fails.
|
|
169
|
+
def run_mothur(tmp_dir, tmp_in)
  # Arguments to classify.seqs(), comma separated.
  classify_args = [
    "fasta=#{tmp_in}",
    "reference=#{@options[:database]}",
    "taxonomy=#{@options[:taxonomy]}",
    'method=wang',
    "processors=#{@options[:cpus]}"
  ].join(',')

  # The whole batch is passed to mothur as one quoted, '#'-prefixed string of
  # semicolon separated commands.
  batch = [
    "#set.dir(input=#{tmp_dir})",
    "set.dir(output=#{tmp_dir})",
    "classify.seqs(#{classify_args})"
  ].join(';')

  cmd = %(mothur "#{batch}").delete("\n")

  # Unless verbose, all of mothur's output is discarded.
  cmd = "#{cmd} > /dev/null 2>&1" unless BioDSL.verbose
  system(cmd)

  fail 'Mothur failed' unless $CHILD_STATUS.success?
end
|
|
184
|
+
|
|
185
|
+
# Parse mothur classification output and emit to stream.
|
|
186
|
+
#
|
|
187
|
+
# @param output [Enumerator::Yielder] Output stream.
|
|
188
|
+
# @param tmp_out [String] Path to file with classification result.
|
|
189
|
+
def process_output(output, tmp_out)
  BioDSL::CSV.open(tmp_out) do |ios|
    ios.each_hash do |row|
      # Column V0 becomes the sequence name, column V1 the taxonomy string;
      # the generic column keys are dropped from the record.
      name     = row.delete(:V0)
      taxonomy = row.delete(:V1)

      # Strip double quotes from the taxonomy string in place.
      taxonomy.tr!('"', '')

      row[:SEQ_NAME]    = name
      row[:TAXONOMY]    = taxonomy
      row[:TAXONOMY]    = confidence_filter(row)
      row[:RECORD_TYPE] = 'taxonomy'

      output << row
      @status[:records_out] += 1
    end
  end
end
|
|
204
|
+
|
|
205
|
+
# Filter taxonomic levels based on the confidence.
|
|
206
|
+
#
|
|
207
|
+
# @param record [Hash] BioDSL record with taxonomy.
|
|
208
|
+
#
|
|
209
|
+
# @return [String] Return taxonomic string.
|
|
210
|
+
def confidence_filter(record)
  kept = record[:TAXONOMY].split(';').each_with_object([]) do |level, acc|
    # A level looks like "Name(NN)"; anything else is skipped.
    parsed = /^([^(]+)\((\d+)\)$/.match(level)
    next unless parsed

    score = parsed[2].to_i

    # Keep only levels that reach the configured confidence cutoff.
    acc << "#{parsed[1]}(#{score})" if score >= @options[:confidence]
  end

  kept.empty? ? 'Unclassified' : kept.join(';')
end
|
|
225
|
+
end
|
|
226
|
+
end
|