BioDSL 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
@@ -0,0 +1,691 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (www.github.com/maasha/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
module BioDSL
|
29
|
+
class TaxonomyError < StandardError; end
|
30
|
+
|
31
|
+
# Module containing classes for creating a taxonomic database and searching
|
32
|
+
# this.
|
33
|
+
module Taxonomy
|
34
|
+
require 'narray'
|
35
|
+
|
36
|
+
TAX_LEVELS = [:r, :k, :p, :c, :o, :f, :g, :s]
|
37
|
+
|
38
|
+
# rubocop: disable ClassLength
|
39
|
+
|
40
|
+
# Class for creating and databasing an index of a taxonomic tree. This is
|
41
|
+
# done in two steps. 1) A temporary tree is created using the taxonomic
|
42
|
+
# strings from the sequence names in a FASTA file. 2) A simplistic tree
|
43
|
+
# is constructed from the temporary tree allowing this to be saved to files.
|
44
|
+
# The resulting index consists of the following files:
|
45
|
+
# * taxonomy_tax_index.dat - return node for a given node id.
|
46
|
+
# * taxonomy_kmer_index.dat - return list of node ids for a given level and
|
47
|
+
# kmer.
|
48
|
+
class Index
|
49
|
+
require 'set'
|
50
|
+
|
51
|
+
# Next unassigned node id. Ids are handed out sequentially starting at 0
# (root), so this value also equals the number of nodes created so far,
# which is why +size+ is provided as an alias. A separate reader for :size
# is deliberately not declared: it would read an @size that is never set
# and would be redefined by the alias straight away.
attr_reader :node_id
alias_method :size, :node_id
|
53
|
+
|
54
|
+
# Constructor Index object.
|
55
|
+
# options - Hash that must hold truthy values for :kmer_size, :step_size,
#           :output_dir and :prefix.
#
# Starts with a tree containing only the root node (node id 0).
# Raises TaxonomyError naming the first missing option.
def initialize(options)
  @options = options # Option hash
  @seq_id  = 0       # Sequence id
  @node_id = 0       # Node id
  @tree    = TaxNode.new(nil, :r, 'root', nil, @node_id) # Root node
  @node_id += 1

  missing = %i(kmer_size step_size output_dir prefix).find do |key|
    !@options[key]
  end

  fail TaxonomyError, "missing #{missing} option" if missing
end
|
66
|
+
|
67
|
+
# Method to add a Sequence entry to the taxonomic tree. The sequence name
|
68
|
+
# contains a taxonomic string.
|
69
|
+
#
|
70
|
+
# Example entry:
|
71
|
+
# seq_name: K#Bacteria;P#Proteobacteria;C#Gammaproteobacteria; \
|
72
|
+
# O#Vibrionales;F#Vibrionaceae;G#Vibrio;S#Vibrio
|
73
|
+
# seq: UCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCCUGA \
|
74
|
+
# UGCAGCCAUGCCGCGUGUAUGAAGGCCUUCGGGUUGUAACUC ...
|
75
|
+
#
|
76
|
+
# The sequence is reduced to a list of oligos of a given size and a given
|
77
|
+
# step size, e.g. 8 and 1, respectively:
|
78
|
+
#
|
79
|
+
# UCCUACGG
|
80
|
+
# CCUACGGG
|
81
|
+
# CUACGGGA
|
82
|
+
# UACGGGAG
|
83
|
+
# ACGGGAGG
|
84
|
+
# ...
|
85
|
+
#
|
86
|
+
# Each oligo is encoded as a kmer (integer) by encoding two bits per
|
87
|
+
# nucleotide:
|
88
|
+
#
|
89
|
+
# A = 00
|
90
|
+
# U = 01
|
91
|
+
# C = 10
|
92
|
+
# G = 11
|
93
|
+
#
|
94
|
+
# E.g. UCCUACGG = 0110100100101111 = 26927
|
95
|
+
#
|
96
|
+
# For each node in the tree a set is kept containing information of
|
97
|
+
# all observed oligos for that particular node. Thus all child nodes
|
98
|
+
# contain a subset of oligos compared to the parent node.
|
99
|
+
# entry - object responding to #seq_name (taxonomy string with ';'
#         separated LEVEL#name fields) and #to_kmers(kmer_size:, step_size:).
#
# Walks/extends the tree one taxonomic level at a time and stores the
# entry's kmers on the deepest named level.
#
# Returns self.
# Raises TaxonomyError on a wrong number of levels, an unexpected or empty
# level identifier, or a named level following an unnamed one.
def add(entry)
  node       = @tree
  old_name   = false
  tax_levels = entry.seq_name.split(';')

  if tax_levels.size != TAX_LEVELS.size - 1
    raise TaxonomyError, "Wrong number of tax levels in #{entry.seq_name}"
  end

  tax_levels.each_with_index do |tax_level, i|
    level, name = tax_level.split('#')

    # An empty field (e.g. "K#Bacteria;;C#...") yields a nil level. Report
    # it as malformed input instead of failing with NoMethodError on nil.
    if level.nil? || level.downcase.to_sym != TAX_LEVELS[i + 1]
      raise TaxonomyError, "Unexpected tax id in #{entry.seq_name}"
    end

    if name
      if i > 0 && !old_name
        raise TaxonomyError, "Gapped tax level info in #{entry.seq_name}"
      end

      child = node[name]

      unless child
        child = TaxNode.new(node, level.downcase.to_sym, name, @seq_id,
                            @node_id)
        @node_id += 1
      end

      # Only the deepest named level receives the kmers directly.
      if leaf?(tax_levels, i)
        kmers = entry.to_kmers(kmer_size: @options[:kmer_size],
                               step_size: @options[:step_size])
        child.kmers |= Set.new(kmers)
      end

      node[name] = child
      node       = child
    end

    old_name = name
  end

  @seq_id += 1

  self
end
|
144
|
+
|
145
|
+
# Remap and save taxonomic tree to index files.
|
146
|
+
# Propagates kmers from the leaves up through every ancestor and then
# writes the kmer index followed by the tax index.
def save
  tree_union # defaults to the root of @tree

  save_kmer_index
  save_tax_index
end
|
152
|
+
|
153
|
+
# Testing method to get a node given an id. Returns nil if node wasn't
|
154
|
+
# found.
|
155
|
+
# id - Integer node id to look for.
#
# Depth-first walk from the root using an explicit stack.
# Returns the first node whose node_id equals id, or nil.
def get_node(id)
  pending = [@tree]

  until pending.empty?
    candidate = pending.pop

    return candidate if candidate.node_id == id

    candidate.children.each_value do |child|
      pending.push(child) unless child.nil?
    end
  end

  nil
end
|
170
|
+
|
171
|
+
# Method that traverses the tax tree and populate all parent nodes with
|
172
|
+
# the union of all kmers from the parent's children.
|
173
|
+
# node - subtree root to process (defaults to the whole tree).
#
# Post-order: children are completed first, then each child's kmer set is
# merged into node. A node without kmers adopts the first child's set as
# is; later merges build a new set, so the child's set is not modified.
def tree_union(node = @tree)
  offspring = node.children

  offspring.each_value { |child| tree_union(child) }

  offspring.each_value do |child|
    next unless child.kmers

    node.kmers = node.kmers ? node.kmers | child.kmers : child.kmers
  end
end
|
185
|
+
|
186
|
+
private
|
187
|
+
|
188
|
+
# Method that determines if a node is a leaf or not.
|
189
|
+
# tax_levels - Array of "LEVEL#name" strings.
# i          - index of the level being examined.
#
# Returns true when there is no following level or the following level
# carries no name after the '#', false otherwise.
def leaf?(tax_levels, i)
  successor = tax_levels[i + 1]

  return true unless successor

  successor.split('#')[1] ? false : true
end
|
196
|
+
|
197
|
+
# Save tax index to file.
|
198
|
+
# Writes <output_dir>/<prefix>_tax_index.dat: a header line followed by one
# tab separated row (seq id, node id, level, name, parent id) per node.
# Nodes are emitted depth-first, visiting a node's last child first.
def save_tax_index
  path = File.join(@options[:output_dir],
                   "#{@options[:prefix]}_tax_index.dat")

  File.open(path, 'wb') do |ios|
    ios.puts %w(#SEQ_ID NODE_ID LEVEL NAME PARENT_ID).join("\t")

    stack = [@tree]

    until stack.empty?
      node = stack.pop
      row  = [node.seq_id, node.node_id, node.level, node.name,
              node.parent_id]

      ios.puts row.join("\t")

      node.children.each_value do |child|
        stack.push(child) unless child.nil?
      end
    end
  end
end
|
217
|
+
|
218
|
+
# Construct and save kmer index to file. This is done BFS style one
|
219
|
+
# taxonomic level at a time to save memory.
|
220
|
+
# Writes <output_dir>/<prefix>_kmer_index.dat: a header line followed by
# one row per (level, kmer) holding the sorted, ';' joined ids of the nodes
# at that depth containing the kmer. Only one depth is indexed at a time.
#
# Returns the kmer => node ids hash of the last depth processed.
def save_kmer_index
  path = File.join(@options[:output_dir],
                   "#{@options[:prefix]}_kmer_index.dat")

  File.open(path, 'wb') do |ios|
    ios.puts %w(#LEVEL KMER NODES).join("\t")

    depth      = 0
    current    = [@tree]
    kmer_index = nil

    until current.empty?
      kmer_index = Hash.new { |hash, kmer| hash[kmer] = [] }

      current.each do |node|
        # to_a turns a nil kmer set into an empty list.
        node.kmers.to_a.each { |kmer| kmer_index[kmer] << node.node_id }
      end

      kmer_index.keys.sort.each do |kmer|
        node_ids = kmer_index[kmer].sort.join(';')

        ios.puts [TAX_LEVELS[depth], kmer, node_ids].join("\t")
      end

      # Next depth: every present child, in parent order.
      current = current.flat_map do |node|
        node.children.each_value.select { |child| child }
      end

      depth += 1
    end

    kmer_index
  end
end
|
251
|
+
|
252
|
+
# Class for the nodes used for constructing the taxonomic tree.
|
253
|
+
# Node of the temporary taxonomic tree: holds its taxonomic level and name,
# a kmer set, and its children keyed by taxonomic name.
class TaxNode
  attr_accessor :kmers
  attr_reader :parent, :level, :name, :children, :seq_id, :node_id

  # parent  - parent TaxNode, or nil for the root.
  # level   - taxonomic level Symbol.
  # name    - taxonomic name String.
  # seq_id  - id of a representative sequence (for debugging).
  # node_id - id of this node.
  def initialize(parent, level, name, seq_id, node_id)
    @parent   = parent
    @level    = level
    @name     = name
    @kmers    = Set.new
    @seq_id   = seq_id
    @node_id  = node_id
    @children = {}
  end

  # Returns parent node id if a parent exists, else nil.
  def parent_id
    @parent.node_id if @parent
  end

  # Returns an array of children node ids.
  def children_ids
    # TaxNode exposes node_id, not id; calling #id raised NoMethodError.
    @children.values.map(&:node_id)
  end

  # Returns the child stored under key (a taxonomic name), or nil.
  def [](key)
    @children[key]
  end

  # Stores value as the child under key (a taxonomic name).
  def []=(key, value)
    @children[key] = value
  end
end
|
292
|
+
end
|
293
|
+
|
294
|
+
# Class for searching sequences in a taxonomic database. The database
|
295
|
+
# consists of a taxonomic tree index and indices for each taxonomic level
|
296
|
+
# saved in the following files:
|
297
|
+
# * taxonomy_tax_index.dat - return node for a given node id.
|
298
|
+
# * taxonomy_kmer_index.dat - return list of node ids for a given level and
|
299
|
+
# kmer.
|
300
|
+
class Search
|
301
|
+
# Sizes handed to BioDSL::CAry for the two work buffers used by the search.
MAX_COUNT    = 200_000          # Element count requested for the count array.
MAX_HITS     = 2_000            # Max num of shared oligos between two sequences.
BYTES_IN_INT = 4                # Element size of the count array.
BYTES_IN_HIT = BYTES_IN_INT * 2 # A hit is two ints: node_id and count.
|
305
|
+
|
306
|
+
# Constructor for initializing a Search object.
|
307
|
+
# options - Hash that must hold truthy values for :kmer_size, :step_size,
#           :dir, :prefix, :consensus, :coverage and :hits_max.
#
# Allocates the count and hit buffers and loads both index files.
# Raises TaxonomyError naming the first missing option.
def initialize(options)
  @options = options

  %i(kmer_size step_size dir prefix consensus coverage hits_max).each do |key|
    next if @options[key]

    fail TaxonomyError, "missing #{key} option"
  end

  @count_ary  = BioDSL::CAry.new(MAX_COUNT, BYTES_IN_INT)
  @hit_ary    = BioDSL::CAry.new(MAX_HITS, BYTES_IN_HIT)
  @tax_index  = load_tax_index
  @kmer_index = load_kmer_index
end
|
321
|
+
|
322
|
+
# Method to execute a search for a given sequence entry. First the
|
323
|
+
# sequence is broken down into unique kmers of a given kmer_size
|
324
|
+
# overlapping with a given step_size. See Taxonomy::Index.add.
|
325
|
+
# Now, for each taxonomic level, starting from species all nodes
|
326
|
+
# for each kmer is looked up in the database. The nodes containing
|
327
|
+
# most kmers are considered hits. If there are no hits at a taxonomic
|
328
|
+
# level, we move to the next level. Hits are sorted according to how
|
329
|
+
# many kmers matched this particular node and a consensus taxonomy
|
330
|
+
# string is determined. Hits are also filtered with the following
|
331
|
+
# options:
|
332
|
+
# * hits_max - Include maximally this number of hits in the consensus.
|
333
|
+
# * best_only - Include only the best scoring hits in the consensus.
|
334
|
+
# That is if a hit consists of 344 kmers out of 345
|
335
|
+
# possible, only hits with 344 kmers are included.
|
336
|
+
# * coverage - Filter hits based on kmer coverage. If a hit contains
|
337
|
+
# fewer kmers than the total amount of kmers x coverage
|
338
|
+
# it will be filtered.
|
339
|
+
# * consensus - For a number of hits accept consensus at a given level
|
340
|
+
# if within this percentage.
|
341
|
+
# entry - object responding to #seq_name and
#         #to_kmers(kmer_size:, step_size:).
#
# Tries each taxonomic level from the most specific to the root and stops
# at the first level yielding hits.
#
# Returns a Result with the number of hits used and the consensus taxonomy
# string (underscores replaced by spaces), or Result(0, 'Unclassified')
# when no level produced a hit.
def execute(entry)
  kmers = entry.to_kmers(kmer_size: @options[:kmer_size],
                         step_size: @options[:step_size])

  puts "DEBUG Q: #{entry.seq_name}" if BioDSL.debug

  TAX_LEVELS.reverse_each do |level|
    kmers_lookup(kmers, level)

    best_only = @options[:best_only] ? 1 : 0
    hit_count = hits_select_C(@count_ary.ary, @count_ary.count,
                              @hit_ary.ary, kmers.size, best_only,
                              @options[:coverage])
    hit_count = @options[:hits_max] if @options[:hits_max] < hit_count

    if hit_count == 0
      puts "DEBUG no hits @ #{level}" if BioDSL.debug
      next
    end

    puts "DEBUG hit(s) @ #{level}" if BioDSL.debug

    taxpaths = (0...hit_count).map do |i|
      # Each hit is BYTES_IN_HIT bytes: node id followed by kmer count.
      first = BYTES_IN_HIT * i
      last  = first + BYTES_IN_HIT

      node_id, count = @hit_ary.ary[first...last].unpack('II')

      taxpath = TaxPath.new(node_id, count, kmers.size, @tax_index)

      if BioDSL.debug
        seq_id = @tax_index[node_id].seq_id
        puts "DEBUG S_ID: #{seq_id} KMERS: [#{count}/#{kmers.size}] #{taxpath}"
      end

      taxpath
    end

    consensus = compile_consensus(taxpaths, hit_count).tr('_', ' ')

    return Result.new(hit_count, consensus)
  end

  Result.new(0, 'Unclassified')
end
|
386
|
+
|
387
|
+
private
|
388
|
+
|
389
|
+
# Method to load and return the tax_index from file.
|
390
|
+
# Reads <dir>/<prefix>_tax_index.dat.
#
# Lines starting with '#' and blank lines are skipped (a blank line used
# to crash on nil.to_sym). Missing numeric fields become 0 through to_i,
# which is how the root row with its empty seq id / parent id is read.
#
# Returns a Hash mapping Integer node id => Node.
def load_tax_index
  tax_index = {}
  file      = File.join(@options[:dir], "#{@options[:prefix]}_tax_index.dat")

  File.open(file) do |ios|
    ios.each_line do |line|
      line.chomp!

      next if line.strip.empty? || line.start_with?('#')

      seq_id, node_id, level, name, parent_id = line.split("\t")

      tax_index[node_id.to_i] = Node.new(seq_id.to_i, node_id.to_i,
                                         level.to_sym, name,
                                         parent_id.to_i)
    end
  end

  tax_index
end
|
409
|
+
|
410
|
+
# Method to load and return the kmer_index from file.
|
411
|
+
# Reads <dir>/<prefix>_kmer_index.dat.
#
# Lines starting with '#' and blank lines are skipped (a blank line used
# to crash on nil.to_sym).
#
# Returns a Hash of level Symbol => { Integer kmer => String }, where the
# String is the list of node ids packed with 'I*'. Looking up an unknown
# level creates and returns an empty inner hash.
def load_kmer_index
  kmer_index = Hash.new { |h, k| h[k] = {} }
  file       = File.join(@options[:dir], "#{@options[:prefix]}_kmer_index.dat")

  File.open(file) do |ios|
    ios.each_line do |line|
      line.chomp!

      next if line.strip.empty? || line.start_with?('#')

      level, kmer, nodes = line.split("\t")

      node_ids = nodes.split(';').map(&:to_i)

      kmer_index[level.to_sym][kmer.to_i] = node_ids.pack('I*')
    end
  end

  kmer_index
end
|
429
|
+
|
430
|
+
# Method that given a list of kmers and a taxonomic level
|
431
|
+
# looks up all the nodes for each kmer and increments the
|
432
|
+
# count array positions for all nodes. The lookup for each
|
433
|
+
# kmer is initially done from a database, but subsequent
|
434
|
+
# lookups for that particular kmer are cached.
|
435
|
+
# kmers - list of Integer kmers from the query.
# level - taxonomic level Symbol selecting the kmer index to use.
#
# Resets the count array, then for every kmer known at this level passes
# its packed node id list (and the number of ids in it) to increment_C.
def kmers_lookup(kmers, level)
  @count_ary.zero!

  kmers.each do |kmer|
    level_index = @kmer_index[level]
    nodes       = level_index && level_index[kmer]

    # nodes is a packed string; its byte size / BYTES_IN_INT = id count.
    increment_C(@count_ary.ary, nodes, nodes.size / BYTES_IN_INT) if nodes
  end
end
|
446
|
+
|
447
|
+
# Method that given a list of taxonomic paths determines a consensus for
|
448
|
+
# each taxonomic level. E.g. for the kingdom level if 60% of the taxpaths
|
449
|
+
# indicate 'Bacteria' and the consensus is 50% then the consensus for the
|
450
|
+
# kingdom level will be reported as 'Bacteria(60)'. If the name at any
|
451
|
+
# level consists of multiple words they are treated independently. E.g if
|
452
|
+
# we have three taxpath at the species level with the names:
|
453
|
+
#
|
454
|
+
# * Escherichia coli K-12
|
455
|
+
# * Escherichia coli sp. AC3432
|
456
|
+
# * Escherichia coli sp. AC1232
|
457
|
+
#
|
458
|
+
# The corresponding consensus for that level will be reported as
|
459
|
+
# 'Escherichia coli sp.(100/100/66)'. The fourth word in the last two
|
460
|
+
# taxonomy strings (AC3432 and AC1232) have a consensus below 50% and are
|
461
|
+
# ignored.
|
462
|
+
# taxpaths - list of taxonomic paths, passed on to decompose_consensus.
# hit_size - number of hits the paths were built from.
#
# A sub-name is accepted when its count is at least
# hit_size * @options[:consensus]; its score is the truncated percentage
# of hits carrying it. Processing stops at the first level where nothing
# is accepted.
#
# Returns the levels joined with ';', or 'Unclassified' if there are none.
def compile_consensus(taxpaths, hit_size)
  levels = []

  decompose_consensus(taxpaths).each do |level, positions|
    accepted = positions.each_value.flat_map do |tally|
      tally.sort_by { |_, count| count }.reverse.select do |_, count|
        count >= hit_size * @options[:consensus]
      end
    end

    break if accepted.empty?

    names  = accepted.map { |subname, _| subname }
    scores = accepted.map { |_, count| ((count / hit_size.to_f) * 100).to_i }

    levels << "#{level.upcase}##{names.join('_')}(#{scores.join('/')})"
  end

  levels.empty? ? 'Unclassified' : levels.join(';')
end
|
491
|
+
|
492
|
+
# Method that given a list of taxonomic paths splits these into a data
|
493
|
+
# structure appropriate for subsequent determination of the taxonomic
|
494
|
+
# consensus.
|
495
|
+
# taxpaths - list of objects whose #nodes returns the root-first node list
#            (each node responding to #level and #name).
#
# Returns a nested Hash: level => word position => sub-name => number of
# taxpaths having that sub-name at that position. Names are split on '_'.
def decompose_consensus(taxpaths)
  tax_hash = Hash.new do |h1, k1|
    h1[k1] = Hash.new { |h2, k2| h2[k2] = Hash.new(0) }
  end

  taxpaths.each do |taxpath|
    # drop(1) skips the root level. Unlike nodes[1..-1] it returns [] rather
    # than nil for an empty node list, so such a path is simply ignored.
    taxpath.nodes.drop(1).each do |node|
      node.name.split('_').each_with_index do |subname, i|
        tax_hash[node.level][i][subname] += 1
      end
    end
  end

  tax_hash
end
|
510
|
+
|
511
|
+
# C helpers compiled through RubyInline. Both buffers are Ruby Strings
# (they are accessed with StringValuePtr), so their real capacity is taken
# from RSTRING_LEN and every write is bounds checked against it.
inline do |builder|
  # Struct for a 'hit' containing two pieces of information:
  #   * node_id - Node id for this particular node.
  #   * count   - Number of kmers matching this particular node.
  builder.prefix %(
    typedef struct
    {
      unsigned int node_id;
      unsigned int count;
    } hit;
  )

  # Qsort hit struct comparison function for sorting
  # hits according to count (highest count first).
  # Returns negative if a > b and positive if b > a. The counts are
  # compared rather than subtracted, because the difference of two
  # unsigned values cast to int can wrap around.
  builder.prefix %{
    int hit_cmp_by_count_C(const void *a, const void *b)
    {
      const hit *ia = (const hit *) a;
      const hit *ib = (const hit *) b;

      return (ib->count > ia->count) - (ib->count < ia->count);
    }
  }

  # Method to select only the best hits from the hit ary, which is sorted
  # according to count (highest count first). The length is reduced to the
  # number of leading hits sharing the top count. Must only be called with
  # a non-empty hit array, since the first element is read unconditionally.
  builder.prefix %{
    void hits_select_best_only_C(
      hit          *hit_ary,      // hit array.
      unsigned int *hit_ary_len   // hit array length.
    )
    {
      unsigned int i   = 0;
      unsigned int max = 0;

      max = hit_ary[i].count;

      i++;

      while ((i < *hit_ary_len) && (hit_ary[i].count == max)){
        i++;
      }

      *hit_ary_len = i;
    }
  }

  # Method for incrementing the count_ary. Each position in the count_ary
  # corresponds to a node_id. The value at the position that is
  # incremented corresponds to the number of shared kmers between this
  # node id and the query sequence.
  # Raises RangeError if a node id lies outside the count array instead of
  # writing past the end of the buffer.
  builder.c %{
    void increment_C(
      VALUE _count_ary,   // Count ary.
      VALUE _nodes_ary,   // Nodes ary.
      VALUE _length       // Nodes ary length.
    )
    {
      int  *count_ary = (int *) StringValuePtr(_count_ary);
      int  *nodes_ary = (int *) StringValuePtr(_nodes_ary);
      long  slots     = RSTRING_LEN(_count_ary) / (long) sizeof(int);
      int   length    = FIX2INT(_length);
      int   i         = 0;

      for (i = 0; i < length; i++) {
        if (nodes_ary[i] < 0 || (long) nodes_ary[i] >= slots) {
          rb_raise(rb_eRangeError, "node id %d outside count array of %ld slots", nodes_ary[i], slots);
        }

        count_ary[nodes_ary[i]]++;
      }
    }
  }

  # Method for selecting hits based from the count_ary. Hits are selected
  # on a number of specified parameters:
  #   * best_only - if this is true only top scoring hits are reported.
  #   * coverage  - Filter hits based on kmer coverage. If a hit contains
  #                 fewer kmers than the total amount of kmers x coverage
  #                 it will be filtered.
  # The resulting hit_ary is sorted according to count (highest count
  # first) and the size of the hit_ary is returned.
  # The scan never reads past the real end of the count array, and
  # RangeError is raised if more hits qualify than the hit array can hold.
  builder.c %{
    VALUE hits_select_C(
      VALUE _count_ary,      // Count ary.
      VALUE _count_ary_len,  // Count ary length.
      VALUE _hit_ary,        // Hit ary.
      VALUE _kmers_size,     // Number of kmers.
      VALUE _best_only,      // Option best_only
      VALUE _coverage        // Option coverage
    )
    {
      int    *count_ary     = (int *) StringValuePtr(_count_ary);
      int     count_ary_len = FIX2INT(_count_ary_len);
      hit    *hit_ary       = (hit *) StringValuePtr(_hit_ary);
      int     kmers_size    = FIX2INT(_kmers_size);
      int     best_only     = FIX2INT(_best_only);
      double  coverage      = NUM2DBL(_coverage);
      long    count_slots   = RSTRING_LEN(_count_ary) / (long) sizeof(int);
      long    hit_slots     = RSTRING_LEN(_hit_ary) / (long) sizeof(hit);

      hit          new_hit = {0, 0};
      int          count   = 0;
      int          i       = 0;
      unsigned int j       = 0;

      if ((long) count_ary_len > count_slots) {
        count_ary_len = (int) count_slots;
      }

      for (i = 0; i < count_ary_len; i++)
      {
        if ((count = count_ary[i]))
        {
          if (count >= kmers_size * coverage)
          {
            if ((long) j >= hit_slots) {
              rb_raise(rb_eRangeError, "more than %ld hits: hit array too small", hit_slots);
            }

            new_hit.node_id = i;
            new_hit.count   = count;

            hit_ary[j] = new_hit;

            j++;
          }
        }
      }

      if (j > 1)
      {
        qsort(hit_ary, j, sizeof(hit), hit_cmp_by_count_C);

        if (best_only) {
          hits_select_best_only_C(hit_ary, &j);
        }
      }

      return UINT2NUM(j);
    }
  }
end
|
640
|
+
|
641
|
+
# Structure for taxonomic tree nodes.
|
642
|
+
# One row of the tax index: ids are Integers, level is a Symbol and name a
# String when built by load_tax_index.
Node = Struct.new(:seq_id, :node_id, :level, :name, :parent_id)

# Search outcome: number of hits used and the resulting taxonomy string.
Result = Struct.new(:hits, :taxonomy)
|
646
|
+
|
647
|
+
# Class holding methods for manipulating taxonomic paths.
|
648
|
+
# Path from the root of the taxonomy down to one hit node.
class TaxPath
  attr_reader :nodes

  # node_id        - id of the hit node.
  # kmers_observed - number of query kmers found for that node.
  # kmers_total    - total number of query kmers.
  # tax_index      - Hash mapping node id => node (responding to #level,
  #                  #name and #parent_id).
  def initialize(node_id, kmers_observed, kmers_total, tax_index)
    @node_id        = node_id
    @kmers_observed = kmers_observed
    @kmers_total    = kmers_total
    @tax_index      = tax_index
    @nodes          = taxonomy_backtrack
  end

  # Follows parent ids from the hit node until the root level (:r) or an id
  # missing from the index is reached.
  #
  # Returns the visited nodes ordered root first; empty if the hit node id
  # is not in the index.
  def taxonomy_backtrack
    nodes   = []
    node_id = @node_id

    while (node = @tax_index[node_id])
      nodes << node

      break if node.level == :r # At root level

      node_id = node.parent_id
    end

    nodes.reverse
  end

  # Returns "LEVEL#name" for every node below the root joined with ';'.
  def to_s
    # drop(1) skips the root and, unlike @nodes[1..-1], returns [] instead
    # of nil when the path is empty, so an unknown node id yields ''.
    @nodes.drop(1).map { |node| "#{node.level.upcase}##{node.name}" }.join(';')
  end
end
|
689
|
+
end
|
690
|
+
end
|
691
|
+
end
|