BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,691 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of BioDSL (www.github.com/maasha/BioDSL). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# Error raised by the Taxonomy classes for malformed taxonomy strings and
# missing options.
class TaxonomyError < StandardError
end
|
|
30
|
+
|
|
31
|
+
# Module containing classes for creating a taxonomic database and searching
|
|
32
|
+
# this.
|
|
33
|
+
module Taxonomy
|
|
34
|
+
require 'narray'
|
|
35
|
+
|
|
36
|
+
# Ordered taxonomic levels, root first: root, kingdom, phylum, class, order,
# family, genus, species. Frozen so the shared constant cannot be mutated by
# accident.
TAX_LEVELS = %i(r k p c o f g s).freeze
|
|
37
|
+
|
|
38
|
+
# rubocop: disable ClassLength
|
|
39
|
+
|
|
40
|
+
# Class for creating and databasing an index of a taxonomic tree. This is
|
|
41
|
+
# done in two steps. 1) A temporary tree is created using the taxonomic
|
|
42
|
+
# strings from the sequence names in a FASTA file. 2) A simplistic tree
|
|
43
|
+
# is constructed from the temporary tree allowing this to be saved to files.
|
|
44
|
+
# The resulting index consists of the following files:
|
|
45
|
+
# * taxonomy_tax_index.dat - return node for a given node id.
|
|
46
|
+
# * taxonomy_kmer_index.dat - return list of node ids for a given level and
|
|
47
|
+
# kmer.
|
|
48
|
+
class Index
|
|
49
|
+
require 'set'
|
|
50
|
+
|
|
51
|
+
# node_id is the id that will be handed to the next node created. Ids are
# assigned sequentially starting at 0 for the root, so the same number is
# also the count of nodes created so far - hence the size alias. (No separate
# reader is generated for size; the alias would overwrite it anyway.)
attr_reader :node_id
alias_method :size, :node_id
|
|
53
|
+
|
|
54
|
+
# Constructor Index object.
|
|
55
|
+
# @param options [Hash] must hold truthy values for :kmer_size, :step_size,
#   :output_dir and :prefix.
# @raise [TaxonomyError] naming the first of those options that is missing.
def initialize(options)
  @options = options # Option hash
  @seq_id  = 0       # Sequence id
  @tree    = TaxNode.new(nil, :r, 'root', nil, 0) # Root node gets id 0
  @node_id = 1       # Next free node id

  required = [:kmer_size, :step_size, :output_dir, :prefix]
  absent   = required.find { |option| !@options[option] }

  raise TaxonomyError, "missing #{absent} option" if absent
end
|
|
66
|
+
|
|
67
|
+
# Method to add a Sequence entry to the taxonomic tree. The sequence name
|
|
68
|
+
# must contain a taxonomic string.
|
|
69
|
+
#
|
|
70
|
+
# Example entry:
|
|
71
|
+
# seq_name: K#Bacteria;P#Proteobacteria;C#Gammaproteobacteria; \
|
|
72
|
+
# O#Vibrionales;F#Vibrionaceae;G#Vibrio;S#Vibrio
|
|
73
|
+
# seq: UCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCCUGA \
|
|
74
|
+
# UGCAGCCAUGCCGCGUGUAUGAAGGCCUUCGGGUUGUAACUC ...
|
|
75
|
+
#
|
|
76
|
+
# The sequence is reduced to a list of oligos of a given size and a given
|
|
77
|
+
# step size, e.g. 8 and 1, respectively:
|
|
78
|
+
#
|
|
79
|
+
# UCCUACGG
|
|
80
|
+
# CCUACGGG
|
|
81
|
+
# CUACGGGA
|
|
82
|
+
# UACGGGAG
|
|
83
|
+
# ACGGGAGG
|
|
84
|
+
# ...
|
|
85
|
+
#
|
|
86
|
+
# Each oligo is encoded as a kmer (integer) by encoding two bits per
|
|
87
|
+
# nucleotide:
|
|
88
|
+
#
|
|
89
|
+
# A = 00
|
|
90
|
+
# U = 01
|
|
91
|
+
# C = 10
|
|
92
|
+
# G = 11
|
|
93
|
+
#
|
|
94
|
+
# E.g. UCCUACGG = 0110100100101111 = 26927
|
|
95
|
+
#
|
|
96
|
+
# For each node in the tree a set is kept containing information of
|
|
97
|
+
# all observed oligos for that particular node. Thus all child nodes
|
|
98
|
+
# contain a subset of oligos compared to the parent node.
|
|
99
|
+
# @param entry [#seq_name, #to_kmers] sequence entry whose name holds the
#   taxonomy string with one `LEVEL#name` field per level below root.
# @return [self]
# @raise [TaxonomyError] if the number of levels is wrong, a level tag is
#   missing or out of order, or a named level follows an unnamed one.
def add(entry)
  node       = @tree
  old_name   = false
  tax_levels = entry.seq_name.split(';')

  if tax_levels.size != TAX_LEVELS.size - 1
    fail TaxonomyError, "Wrong number of tax levels in #{entry.seq_name}"
  end

  tax_levels.each_with_index do |tax_level, i|
    level, name = tax_level.split('#')

    # An empty field (e.g. 'K#Bacteria;;C#...') yields no level tag at all;
    # report it as a taxonomy error instead of failing on nil.downcase.
    level_sym = level && level.downcase.to_sym

    if level_sym != TAX_LEVELS[i + 1]
      fail TaxonomyError, "Unexpected tax id in #{entry.seq_name}"
    end

    if name
      if i > 0 && !old_name
        fail TaxonomyError, "Gapped tax level info in #{entry.seq_name}"
      end

      child = node[name]

      unless child
        # First time this name is seen under the current node.
        child = TaxNode.new(node, level_sym, name, @seq_id, @node_id)
        @node_id += 1
        node[name] = child
      end

      # Kmers are only stored on the deepest named level of the entry;
      # parents are filled in later by tree_union.
      if leaf?(tax_levels, i)
        kmers = entry.to_kmers(kmer_size: @options[:kmer_size],
                               step_size: @options[:step_size])
        child.kmers |= Set.new(kmers)
      end

      node = child
    end

    old_name = name
  end

  @seq_id += 1

  self
end
|
|
144
|
+
|
|
145
|
+
# Remap and save taxonomic tree to index files.
|
|
146
|
+
# Propagates kmers up the tree, then writes the kmer index followed by the
# tax index. Returns whatever save_tax_index returns.
def save
  tree_union # Starts from the root held in @tree

  save_kmer_index
  save_tax_index
end
|
|
152
|
+
|
|
153
|
+
# Testing method to get a node given an id. Returns nil if node wasn't
|
|
154
|
+
# found.
|
|
155
|
+
# Walks the tree from the root and returns the first node whose node_id
# equals +id+, or nil when no node matches.
def get_node(id)
  # The end of the array is the next node to visit, which gives the same
  # visiting order as shifting from / unshifting onto the front of a queue.
  stack = [@tree]

  until stack.empty?
    current = stack.pop

    return current if current.node_id == id

    current.children.each_value do |child|
      stack.push(child) unless child.nil?
    end
  end

  nil
end
|
|
170
|
+
|
|
171
|
+
# Method that traverses the tax tree and populate all parent nodes with
|
|
172
|
+
# the union of all kmers from the parent's children.
|
|
173
|
+
# Post-order traversal: children are completed first, then every child's
# kmer set is folded into +node+. Children without a kmer set are skipped.
def tree_union(node = @tree)
  node.children.each_value { |child| tree_union(child) }

  node.children.each_value do |child|
    next if child.kmers.nil?

    node.kmers = node.kmers.nil? ? child.kmers : node.kmers | child.kmers
  end
end
|
|
185
|
+
|
|
186
|
+
private
|
|
187
|
+
|
|
188
|
+
# Method that determines if a node is a leaf or not.
|
|
189
|
+
# True when the level following index +i+ is absent or carries no name after
# its '#', i.e. level +i+ is the deepest named level of the entry.
def leaf?(tax_levels, i)
  following = tax_levels[i + 1]

  !(following && following.split('#')[1])
end
|
|
196
|
+
|
|
197
|
+
# Save tax index to file.
|
|
198
|
+
# Writes one tab separated line per node (seq id, node id, level, name,
# parent id) to <output_dir>/<prefix>_tax_index.dat, preceded by a header
# line starting with '#'.
def save_tax_index
  path   = File.join(@options[:output_dir],
                     "#{@options[:prefix]}_tax_index.dat")
  header = %w(#SEQ_ID NODE_ID LEVEL NAME PARENT_ID)

  File.open(path, 'wb') do |out|
    out.puts header.join("\t")

    # The end of the array is the next node to write; this reproduces the
    # order of a queue that is shifted from and unshifted onto at the front.
    stack = [@tree]

    until stack.empty?
      node   = stack.pop
      fields = [node.seq_id, node.node_id, node.level, node.name,
                node.parent_id]

      out.puts fields.join("\t")

      node.children.each_value do |child|
        stack.push(child) unless child.nil?
      end
    end
  end
end
|
|
217
|
+
|
|
218
|
+
# Construct and save kmer index to file. This is done BFS style one
|
|
219
|
+
# taxonomic level at a time to save memory.
|
|
220
|
+
# Writes <output_dir>/<prefix>_kmer_index.dat one tree depth at a time. Each
# line holds the level symbol, a kmer, and the ';' separated sorted ids of
# the nodes at that depth containing the kmer. Returns the kmer => node ids
# hash built for the last depth processed.
def save_kmer_index
  path = File.join(@options[:output_dir],
                   "#{@options[:prefix]}_kmer_index.dat")

  File.open(path, 'wb') do |out|
    out.puts %w(#LEVEL KMER NODES).join("\t")

    depth   = 0
    current = [@tree]

    until current.empty?
      # Rebuilt per depth so only one level is held in memory at a time.
      kmer_index = Hash.new { |hash, kmer| hash[kmer] = [] }
      following  = []

      current.each do |node|
        node.kmers.to_a.each { |kmer| kmer_index[kmer] << node.node_id }
        node.children.each_value { |child| following << child if child }
      end

      kmer_index.keys.sort.each do |kmer|
        node_ids = kmer_index[kmer].sort.join(';')

        out.puts [TAX_LEVELS[depth], kmer, node_ids].join("\t")
      end

      current = following
      depth += 1
    end

    kmer_index
  end
end
|
|
251
|
+
|
|
252
|
+
# Class for the nodes used for constructing the taxonomic tree.
|
|
253
|
+
# Node of the temporary taxonomic tree built by Index. Children are kept in
# a hash keyed by taxonomic name.
class TaxNode
  attr_accessor :kmers
  attr_reader :parent, :level, :name, :children, :seq_id, :node_id

  # Constructor for TaxNode objects.
  #
  # @param parent  [TaxNode, nil] parent node, nil for the root.
  # @param level   [Symbol] taxonomic level.
  # @param name    [String] taxonomic name.
  # @param seq_id  [Integer, nil] id of a representative sequence.
  # @param node_id [Integer] id of this node.
  def initialize(parent, level, name, seq_id, node_id)
    @parent   = parent  # Parent node.
    @level    = level   # Taxonomic level.
    @name     = name    # Taxonomic name.
    @kmers    = Set.new # Kmer set.
    @seq_id   = seq_id  # Seq id (a representative seq for debugging).
    @node_id  = node_id # Node id.
    @children = {}      # Child node hash.
  end

  # Returns parent node id if a parent exist, else nil.
  def parent_id
    @parent.node_id if @parent
  end

  # Returns an array of children node ids. Nil entries in the child hash are
  # skipped, as the tree traversals in Index do.
  def children_ids
    @children.values.compact.map(&:node_id)
  end

  # Getter method for node children.
  def [](key)
    @children[key]
  end

  # Setter method for node children.
  def []=(key, value)
    @children[key] = value
  end
end
|
|
292
|
+
end
|
|
293
|
+
|
|
294
|
+
# Class for searching sequences in a taxonomic database. The database
|
|
295
|
+
# consists of a taxonomic tree index and indices for each taxonomic level
|
|
296
|
+
# saved in the following files:
|
|
297
|
+
# * taxonomy_tax_index.dat - return node for a given node id.
|
|
298
|
+
# * taxonomy_kmer_index.dat - return list of node ids for a given level and
|
|
299
|
+
# kmer.
|
|
300
|
+
class Search
|
|
301
|
+
MAX_COUNT = 200_000
|
|
302
|
+
MAX_HITS = 2_000 # Max num of shared oligos between two sequences.
|
|
303
|
+
BYTES_IN_INT = 4
|
|
304
|
+
BYTES_IN_HIT = 2 * BYTES_IN_INT
|
|
305
|
+
|
|
306
|
+
# Constructor for initializing a Search object.
|
|
307
|
+
# @param options [Hash] must hold truthy values for :kmer_size, :step_size,
#   :dir, :prefix, :consensus, :coverage and :hits_max.
# @raise [TaxonomyError] naming the first of those options that is missing.
def initialize(options)
  @options = options

  required = [:kmer_size, :step_size, :dir, :prefix, :consensus, :coverage,
              :hits_max]
  absent   = required.find { |opt| !@options[opt] }

  raise TaxonomyError, "missing #{absent} option" if absent

  # Scratch buffers shared with the inline C functions, then both indexes.
  @count_ary  = BioDSL::CAry.new(MAX_COUNT, BYTES_IN_INT)
  @hit_ary    = BioDSL::CAry.new(MAX_HITS, BYTES_IN_HIT)
  @tax_index  = load_tax_index
  @kmer_index = load_kmer_index
end
|
|
321
|
+
|
|
322
|
+
# Method to execute a search for a given sequence entry. First the
|
|
323
|
+
# sequence is broken down into unique kmers of a given kmer_size
|
|
324
|
+
# overlapping with a given step_size. See Taxonomy::Index.add.
|
|
325
|
+
# Now, for each taxonomic level, starting from species all nodes
|
|
326
|
+
# for each kmer are looked up in the database. The nodes containing
|
|
327
|
+
# most kmers are considered hits. If there are no hits at a taxonomic
|
|
328
|
+
# level, we move to the next level. Hits are sorted according to how
|
|
329
|
+
# many kmers matched this particular node and a consensus taxonomy
|
|
330
|
+
# string is determined. Hits are also filtered with the following
|
|
331
|
+
# options:
|
|
332
|
+
# * hits_max - Include maximally this number of hits in the consensus.
|
|
333
|
+
# * best_only - Include only the best scoring hits in the consensus.
|
|
334
|
+
# That is if a hit consists of 344 kmers out of 345
|
|
335
|
+
# possible, only hits with 344 kmers are included.
|
|
336
|
+
# * coverage - Filter hits based on kmer coverage. If a hit contains
|
|
337
|
+
# fewer kmers than the total amount of kmers x coverage
|
|
338
|
+
# it will be filtered.
|
|
339
|
+
# * consensus - For a number of hits accept consensus at a given level
|
|
340
|
+
# if within this percentage.
|
|
341
|
+
# @param entry [#seq_name, #to_kmers] query sequence entry.
# @return [Result] hit count and consensus taxonomy of the first level,
#   going from species towards root, that yields any hit; otherwise a
#   Result with 0 hits and the taxonomy 'Unclassified'.
def execute(entry)
  kmers = entry.to_kmers(kmer_size: @options[:kmer_size],
                         step_size: @options[:step_size])

  puts "DEBUG Q: #{entry.seq_name}" if BioDSL.debug

  TAX_LEVELS.reverse_each do |level|
    kmers_lookup(kmers, level)

    best_only = @options[:best_only] ? 1 : 0
    hit_count = hits_select_C(@count_ary.ary, @count_ary.count, @hit_ary.ary,
                              kmers.size, best_only, @options[:coverage])
    hit_count = @options[:hits_max] if @options[:hits_max] < hit_count

    if hit_count == 0
      puts "DEBUG no hits @ #{level}" if BioDSL.debug
      next
    end

    puts "DEBUG hit(s) @ #{level}" if BioDSL.debug

    taxpaths = (0...hit_count).map do |i|
      # Each hit is a pair of native unsigned ints: node id and kmer count.
      packed         = @hit_ary.ary[BYTES_IN_HIT * i, BYTES_IN_HIT]
      node_id, count = packed.unpack('II')
      taxpath        = TaxPath.new(node_id, count, kmers.size, @tax_index)

      if BioDSL.debug
        seq_id = @tax_index[node_id].seq_id
        puts "DEBUG S_ID: #{seq_id} KMERS: [#{count}/#{kmers.size}] " \
             "#{taxpath}"
      end

      taxpath
    end

    taxonomy = compile_consensus(taxpaths, hit_count).tr('_', ' ')

    return Result.new(hit_count, taxonomy)
  end

  Result.new(0, 'Unclassified')
end
|
|
386
|
+
|
|
387
|
+
private
|
|
388
|
+
|
|
389
|
+
# Method to load and return the tax_index from file.
|
|
390
|
+
# Reads <dir>/<prefix>_tax_index.dat and returns a hash mapping node id to a
# Node struct. Lines starting with '#' are skipped.
def load_tax_index
  path  = File.join(@options[:dir], "#{@options[:prefix]}_tax_index.dat")
  index = {}

  File.foreach(path) do |raw|
    row = raw.chomp

    next if row[0] == '#'

    seq_id, node_id, level, name, parent_id = row.split("\t")

    # Missing trailing fields come back as nil and are turned into 0 by to_i.
    index[node_id.to_i] = Node.new(seq_id.to_i, node_id.to_i, level.to_sym,
                                   name, parent_id.to_i)
  end

  index
end
|
|
409
|
+
|
|
410
|
+
# Method to load and return the kmer_index from file.
|
|
411
|
+
# Reads <dir>/<prefix>_kmer_index.dat and returns a hash of
# level => { kmer => node ids packed as native unsigned ints }. Lines
# starting with '#' are skipped.
def load_kmer_index
  path  = File.join(@options[:dir], "#{@options[:prefix]}_kmer_index.dat")
  index = Hash.new { |hash, level| hash[level] = {} }

  File.foreach(path) do |raw|
    row = raw.chomp

    next if row[0] == '#'

    level, kmer, nodes = row.split("\t")
    per_level          = index[level.to_sym]
    node_ids           = nodes.split(';').map { |id| id.to_i }

    # Stored packed so the string can be handed straight to the C code.
    per_level[kmer.to_i] = node_ids.pack('I*')
  end

  index
end
|
|
429
|
+
|
|
430
|
+
# Method that given a list of kmers and a taxonomic level
|
|
431
|
+
# lookups all the nodes for each kmer and increment the
|
|
432
|
+
# count array positions for all nodes. The lookup for each
|
|
433
|
+
# kmer is initially done from a database, but subsequent
|
|
434
|
+
# lookups for that particular kmer are cached.
|
|
435
|
+
# Zeroes the count array, then for every kmer that has an entry at +level+
# passes the packed node id string to increment_C together with the number
# of ids it holds. Returns +kmers+.
def kmers_lookup(kmers, level)
  @count_ary.zero!

  kmers.each do |kmer|
    level_index = @kmer_index[level]

    next unless level_index

    nodes = level_index[kmer]

    next unless nodes

    increment_C(@count_ary.ary, nodes, nodes.size / BYTES_IN_INT)
  end
end
|
|
446
|
+
|
|
447
|
+
# Method that given a list of taxonomic paths determines a consensus for
|
|
448
|
+
# each taxonomic level. E.g. for the kingdom level if 60% of the taxpaths
|
|
449
|
+
# indicate 'Bacteria' and the consensus is 50% then the consensus for the
|
|
450
|
+
# kingdom level will be reported as 'Bacteria(60)'. If the name at any
|
|
451
|
+
# level consists of multiple words they are treated independently. E.g if
|
|
452
|
+
# we have three taxpath at the species level with the names:
|
|
453
|
+
#
|
|
454
|
+
# * Escherichia coli K-12
|
|
455
|
+
# * Escherichia coli sp. AC3432
|
|
456
|
+
# * Escherichia coli sp. AC1232
|
|
457
|
+
#
|
|
458
|
+
# The corresponding consensus for that level will be reported as
|
|
459
|
+
# 'Escherichia coli sp.(100/100/66)'. The fourth word in the last two
|
|
460
|
+
# taxonomy strings (AC3432 and AC1232) have a consensus below 50% and are
|
|
461
|
+
# ignored.
|
|
462
|
+
# @param taxpaths [Array] taxonomic paths handed to decompose_consensus.
# @param hit_size [Integer] number of hits the counts are measured against.
# @return [String] ';' joined 'LEVEL#words(scores)' fields, stopping at the
#   first level without any word reaching the consensus threshold, or
#   'Unclassified' when no level qualifies.
def compile_consensus(taxpaths, hit_size)
  fields = []

  decompose_consensus(taxpaths).each do |level, positions|
    words  = []
    scores = []

    positions.each_value do |counts|
      # Most frequent candidate for this word position first.
      counts.sort_by { |_, count| count }.reverse_each do |subname, count|
        next unless count >= hit_size * @options[:consensus]

        words  << subname
        scores << ((count / hit_size.to_f) * 100).to_i
      end
    end

    break if words.empty?

    fields << "#{level.upcase}##{words.join('_')}(#{scores.join('/')})"
  end

  fields.empty? ? 'Unclassified' : fields.join(';')
end
|
|
491
|
+
|
|
492
|
+
# Method that given a list of taxonomic paths splits these into a data
|
|
493
|
+
# structure appropriate for subsequence determination of the taxonomic
|
|
494
|
+
# consensus.
|
|
495
|
+
# Counts, per taxonomic level and per word position within the '_' separated
# node name, how many taxpaths carry each word.
#
# @return [Hash] level => { word position => { word => count } }
def decompose_consensus(taxpaths)
  tally = Hash.new do |by_level, level|
    by_level[level] = Hash.new { |by_pos, pos| by_pos[pos] = Hash.new(0) }
  end

  taxpaths.each do |taxpath|
    below_root = taxpath.nodes[1..-1] # The root node carries no taxonomy

    below_root.each do |node|
      words = node.name.split('_')

      words.each_index do |pos|
        tally[node.level][pos][words[pos]] += 1
      end
    end
  end

  tally
end
|
|
510
|
+
|
|
511
|
+
# Inline C helpers. Both buffers are passed in as Ruby strings; their usable
# capacity is derived from the byte length of the string, so writes and
# increments outside the buffer raise BioDSL::TaxonomyError instead of
# corrupting memory.
inline do |builder|
  # Struct for a 'hit' containing two pieces of information:
  #   * node_id - Node id for this particular node.
  #   * count   - Number of kmers matching this particular node.
  builder.prefix %(
    typedef struct
    {
      unsigned int node_id;
      unsigned int count;
    } hit;
  )

  # Qsort hit struct comparison function for sorting
  # hits according to count (highest count first).
  # Returns negative if a > b and positive if b > a. The counts are compared
  # rather than subtracted, since they are unsigned.
  builder.prefix %{
    int hit_cmp_by_count_C(const void *a, const void *b)
    {
      const hit *ia = (const hit *) a;
      const hit *ib = (const hit *) b;

      if (ib->count > ia->count) return 1;
      if (ib->count < ia->count) return -1;

      return 0;
    }
  }

  # Method to select only the best hits from the hit ary, which is sorted
  # according to count (highest count first). An empty hit ary is left
  # untouched.
  builder.prefix %{
    void hits_select_best_only_C(
      hit          *hit_ary,     // hit array.
      unsigned int *hit_ary_len  // hit array length.
    )
    {
      unsigned int i   = 0;
      unsigned int max = 0;

      if (*hit_ary_len == 0) return;

      max = hit_ary[0].count;

      i++;

      while ((i < *hit_ary_len) && (hit_ary[i].count == max)) {
        i++;
      }

      *hit_ary_len = i;
    }
  }

  # Method for incrementing the count_ary. Each position in the count_ary
  # corresponds to a node_id. The value at the position that is
  # incremented corresponds to the number of shared kmers between this
  # node id and the query sequence. Raises BioDSL::TaxonomyError if a node
  # id does not fit in the count_ary.
  builder.c %{
    void increment_C(
      VALUE _count_ary,   // Count ary.
      VALUE _nodes_ary,   // Nodes ary.
      VALUE _length       // Nodes ary length.
    )
    {
      int  *count_ary = (int *) StringValuePtr(_count_ary);
      int  *nodes_ary = (int *) StringValuePtr(_nodes_ary);
      int   length    = FIX2INT(_length);
      long  capacity  = (long) (RSTRING_LEN(_count_ary) / sizeof(int));
      int   i         = 0;

      for (i = 0; i < length; i++) {
        if (nodes_ary[i] < 0 || (long) nodes_ary[i] >= capacity) {
          rb_raise(rb_path2class("BioDSL::TaxonomyError"),
                   "node id %d outside count array of size %ld",
                   nodes_ary[i], capacity);
        }

        count_ary[nodes_ary[i]]++;
      }
    }
  }

  # Method for selecting hits based from the count_ary. Hits are selected
  # on a number of specified parameters:
  #   * best_only - if this is true only top scoring hits are reported.
  #   * coverage  - Filter hits based on kmer coverage. If a hit contains
  #                 fewer kmers than the total amount of kmers x coverage
  #                 it will be filtered.
  # The resulting hit_ary is sorted according to count (highest count
  # first) and the size of the hit_ary is returned. Raises
  # BioDSL::TaxonomyError if more hits qualify than the hit_ary can hold.
  builder.c %{
    VALUE hits_select_C(
      VALUE _count_ary,       // Count ary.
      VALUE _count_ary_len,   // Count ary length.
      VALUE _hit_ary,         // Hit ary.
      VALUE _kmers_size,      // Number of kmers.
      VALUE _best_only,       // Option best_only
      VALUE _coverage         // Option coverage
    )
    {
      int    *count_ary     = (int *) StringValuePtr(_count_ary);
      int     count_ary_len = FIX2INT(_count_ary_len);
      hit    *hit_ary       = (hit *) StringValuePtr(_hit_ary);
      long    hit_ary_cap   = (long) (RSTRING_LEN(_hit_ary) / sizeof(hit));
      int     kmers_size    = FIX2INT(_kmers_size);
      int     best_only     = FIX2INT(_best_only);
      double  coverage      = NUM2DBL(_coverage);

      hit          new_hit = {0, 0};
      int          count   = 0;
      int          i       = 0;
      unsigned int j       = 0;

      for (i = 0; i < count_ary_len; i++)
      {
        if ((count = count_ary[i]))
        {
          if (count >= kmers_size * coverage)
          {
            if ((long) j >= hit_ary_cap)
            {
              rb_raise(rb_path2class("BioDSL::TaxonomyError"),
                       "more than %ld hits - hit array exhausted",
                       hit_ary_cap);
            }

            new_hit.node_id = i;
            new_hit.count   = count;

            hit_ary[j] = new_hit;

            j++;
          }
        }
      }

      if (j > 1)
      {
        qsort(hit_ary, j, sizeof(hit), hit_cmp_by_count_C);

        if (best_only) {
          hits_select_best_only_C(hit_ary, &j);
        }
      }

      return UINT2NUM(j);
    }
  }
end
|
|
640
|
+
|
|
641
|
+
# Structure for taxonomic tree nodes.
|
|
642
|
+
Node = Struct.new(:seq_id, :node_id, :level, :name, :parent_id)
|
|
643
|
+
|
|
644
|
+
# Structure for holding the search result.
|
|
645
|
+
Result = Struct.new(:hits, :taxonomy)
|
|
646
|
+
|
|
647
|
+
# Class holding methods for manipulating taxonomic paths.
|
|
648
|
+
# Path of taxonomic nodes from the root down to a given node, resolved
# through a node id => node index.
class TaxPath
  attr_reader :nodes

  # Constructor method for TaxPath objects.
  #
  # @param node_id        [Integer] id of the node the path ends at.
  # @param kmers_observed [Integer] number of kmers matching that node.
  # @param kmers_total    [Integer] number of kmers in the query.
  # @param tax_index      [Hash] node id => node (responding to level, name
  #   and parent_id).
  def initialize(node_id, kmers_observed, kmers_total, tax_index)
    @node_id        = node_id
    @kmers_observed = kmers_observed
    @kmers_total    = kmers_total
    @tax_index      = tax_index
    @nodes          = taxonomy_backtrack
  end

  # Method that returns a list of nodes for a given node_id and all
  # parent ids up the taxonomy tree, ordered root first. The list is empty
  # when the node id is not in the index.
  def taxonomy_backtrack
    nodes   = []
    node_id = @node_id

    while (node = @tax_index[node_id])
      nodes << node

      break if node.level == :r # At root level

      node_id = node.parent_id
    end

    nodes.reverse
  end

  # Returns formatted taxonomy string, leaving out the first (root) node.
  # An empty path gives an empty string.
  def to_s
    @nodes.drop(1).map { |node| "#{node.level.upcase}##{node.name}" }.join(';')
  end
end
|
|
689
|
+
end
|
|
690
|
+
end
|
|
691
|
+
end
|