BioDSL 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,691 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ class TaxonomyError < StandardError; end
30
+
31
+ # Module containing classes for creating a taxonomic database and searching
32
+ # this.
33
+ module Taxonomy
34
+ require 'narray'
35
+
36
+ TAX_LEVELS = [:r, :k, :p, :c, :o, :f, :g, :s]
37
+
38
+ # rubocop: disable ClassLength
39
+
40
+ # Class for creating and databasing an index of a taxonomic tree. This is
41
+ # done in two steps. 1) A temporary tree is created using the taxonomic
42
+ # strings from the sequence names in a FASTA file. 2) A simplistic tree
43
+ # is constructed from the temporary tree allowing this to be saved to files.
44
+ # The resulting index consists of the following files:
45
+ # * taxonomy_tax_index.dat - return node for a given node id.
46
+ # * taxonomy_kmer_index.dat - return list of node ids for a given level and
47
+ # kmer.
48
+ class Index
49
+ require 'set'
50
+
51
# node_id is the id that will be handed to the next node created. Ids are
# assigned sequentially starting at 0 (the root), so the same number is also
# the count of nodes in the tree, which is why #size is an alias for it.
# (No separate attr_reader for :size - the alias would simply overwrite it.)
attr_reader :node_id
alias_method :size, :node_id
53
+
54
# Constructor for Index objects.
#
# options - Hash that must hold truthy values for :kmer_size, :step_size,
#           :output_dir and :prefix.
#
# Creates the root node (node id 0) and then checks the options.
#
# Raises TaxonomyError naming the first required option that is missing.
def initialize(options)
  @options = options                                      # Option hash
  @seq_id  = 0                                            # Sequence id
  @node_id = 0                                            # Node id
  @tree    = TaxNode.new(nil, :r, 'root', nil, @node_id)  # Root node
  @node_id += 1

  required = %i(kmer_size step_size output_dir prefix)
  absent   = required.find { |key| !@options[key] }

  fail TaxonomyError, "missing #{absent} option" if absent
end
66
+
67
# Method to add a Sequence entry to the taxonomic tree. The sequence name
# must contain a taxonomic string.
#
# Example entry:
# seq_name: K#Bacteria;P#Proteobacteria;C#Gammaproteobacteria; \
#           O#Vibrionales;F#Vibrionaceae;G#Vibrio;S#Vibrio
# seq:      UCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCCUGA \
#           UGCAGCCAUGCCGCGUGUAUGAAGGCCUUCGGGUUGUAACUC ...
#
# The sequence is reduced to a list of oligos of a given size and a given
# step size, e.g. 8 and 1, respectively:
#
# UCCUACGG
#  CCUACGGG
#   CUACGGGA
#    UACGGGAG
#     ACGGGAGG
#      ...
#
# Each oligo is encoded as a kmer (integer) by encoding two bits per
# nucleotide:
#
# A = 00
# U = 01
# C = 10
# G = 11
#
# E.g. UCCUACGG = 0110100100101111 = 26927
#
# For each node in the tree a set is kept containing information of
# all observed oligos for that particular node. Thus all child nodes
# contain a subset of oligos compared to the parent node.
#
# entry - Object responding to #seq_name and #to_kmers.
#
# Raises TaxonomyError if the number of levels is wrong, if a level id is
# missing or out of order, or if a named level follows an unnamed one.
#
# Returns self.
def add(entry)
  node       = @tree
  old_name   = false
  tax_levels = entry.seq_name.split(';')

  if tax_levels.size != TAX_LEVELS.size - 1
    fail TaxonomyError, "Wrong number of tax levels in #{entry.seq_name}"
  end

  tax_levels.each_with_index do |tax_level, i|
    level, name = tax_level.split('#')

    # An empty field (e.g. "K#Bacteria;;C#...") yields a nil level; treat it
    # as an unexpected id instead of crashing on nil.downcase.
    level_sym = level && level.downcase.to_sym

    if level_sym != TAX_LEVELS[i + 1]
      fail TaxonomyError, "Unexpected tax id in #{entry.seq_name}"
    end

    if name
      if i > 0 && !old_name
        fail TaxonomyError, "Gapped tax level info in #{entry.seq_name}"
      end

      # Reuse an existing child with this name or create a new one.
      unless (child = node[name])
        child = TaxNode.new(node, level_sym, name, @seq_id, @node_id)
        @node_id += 1
      end

      # Only the deepest named level of an entry receives the kmers.
      if leaf?(tax_levels, i)
        kmers = entry.to_kmers(kmer_size: @options[:kmer_size],
                               step_size: @options[:step_size])
        child.kmers |= Set.new(kmers)
      end

      node[name] = child
      node       = child
    end

    old_name = name
  end

  @seq_id += 1

  self
end
144
+
145
# Propagate kmers from the leaves up to the root, then write the kmer index
# file followed by the tax index file.
def save
  tree_union

  save_kmer_index
  save_tax_index
end
152
+
153
# Testing method that searches the tree depth first for the node with the
# given node id.
#
# id - Integer node id to look for.
#
# Returns the matching node, or nil if no node has that id.
def get_node(id)
  stack = [@tree]

  until stack.empty?
    candidate = stack.pop

    return candidate if candidate.node_id == id

    candidate.children.each_value do |child|
      stack.push(child) unless child.nil?
    end
  end

  nil
end
170
+
171
# Method that traverses the tax tree bottom-up and populates every parent
# node with the union of the kmers of all of the parent's children.
#
# node - Node to start from (defaults to the root of the tree).
def tree_union(node = @tree)
  node.children.each_value do |child|
    # Finish the child's subtree first so its kmer set is complete.
    tree_union(child)

    next if child.kmers.nil?

    node.kmers = node.kmers.nil? ? child.kmers : node.kmers | child.kmers
  end
end
185
+
186
+ private
187
+
188
# Method that determines if the tax level at index i is the deepest named
# level, i.e. there is no following level or the following level has no
# name after the '#'.
#
# tax_levels - Array of 'LEVEL#name' strings.
# i          - Integer index of the level to check.
#
# Returns true or false.
def leaf?(tax_levels, i)
  successor = tax_levels[i + 1]

  return true unless successor

  successor.split('#')[1] ? false : true
end
196
+
197
# Save tax index to the file <output_dir>/<prefix>_tax_index.dat.
#
# The file starts with a header line, followed by one tab separated line
# per node (seq id, node id, level, name, parent id), written in depth
# first order.
def save_tax_index
  path = File.join(@options[:output_dir],
                   "#{@options[:prefix]}_tax_index.dat")

  File.open(path, 'wb') do |ios|
    ios.puts %w(#SEQ_ID NODE_ID LEVEL NAME PARENT_ID).join("\t")
    stack = [@tree]

    until stack.empty?
      node = stack.pop
      row  = [node.seq_id, node.node_id, node.level, node.name,
              node.parent_id]

      ios.puts row.join("\t")

      node.children.each_value do |child|
        stack.push(child) unless child.nil?
      end
    end
  end
end
217
+
218
# Construct the kmer index and save it to the file
# <output_dir>/<prefix>_kmer_index.dat. The tree is walked breadth first,
# one taxonomic level at a time, so only one level's index is held in
# memory at once.
#
# Each line holds the level, the kmer and the ';' separated sorted ids of
# the nodes at that level containing the kmer.
#
# Returns the kmer index Hash of the last level that was processed.
def save_kmer_index
  path = File.join(@options[:output_dir],
                   "#{@options[:prefix]}_kmer_index.dat")

  File.open(path, 'wb') do |ios|
    ios.puts %w(#LEVEL KMER NODES).join("\t")

    depth   = 0
    current = [@tree]

    until current.empty?
      kmer_index = Hash.new { |hash, kmer| hash[kmer] = [] }
      following  = []

      current.each do |node|
        node.kmers.to_a.each { |kmer| kmer_index[kmer] << node.node_id }
        node.children.each_value { |child| following << child if child }
      end

      kmer_index.sort_by { |kmer, _| kmer }.each do |kmer, node_ids|
        ios.puts [TAX_LEVELS[depth], kmer, node_ids.sort.join(';')].join("\t")
      end

      current = following
      depth  += 1
    end

    kmer_index
  end
end
251
+
252
# Class for the nodes used for constructing the taxonomic tree.
class TaxNode
  attr_accessor :kmers
  attr_reader :parent, :level, :name, :children, :seq_id, :node_id

  # Constructor for TaxNode objects.
  #
  # parent  - Parent TaxNode, or nil for the root.
  # level   - Taxonomic level Symbol.
  # name    - Taxonomic name String.
  # seq_id  - Sequence id (a representative seq for debugging).
  # node_id - Node id.
  def initialize(parent, level, name, seq_id, node_id)
    @parent   = parent    # Parent node.
    @level    = level     # Taxonomic level.
    @name     = name      # Taxonomic name.
    @kmers    = Set.new   # Kmer set.
    @seq_id   = seq_id    # Seq id (a representative seq for debugging).
    @node_id  = node_id   # Node id.
    @children = {}        # Child node hash.
  end

  # Returns parent node id if a parent exists, else nil.
  def parent_id
    @parent.node_id if @parent
  end

  # Returns an array of children node ids.
  def children_ids
    # TaxNode exposes #node_id, not #id.
    @children.values.map(&:node_id)
  end

  # Getter method for node children.
  def [](key)
    @children[key]
  end

  # Setter method for node children.
  def []=(key, value)
    @children[key] = value
  end
end
292
+ end
293
+
294
+ # Class for searching sequences in a taxonomic database. The database
295
+ # consists of a taxonomic tree index and indices for each taxonomic level
296
+ # saved in the following files:
297
+ # * taxonomy_tax_index.dat - return node for a given node id.
298
+ # * taxonomy_kmer_index.dat - return list of node ids for a given level and
299
+ # kmer.
300
+ class Search
301
+ MAX_COUNT = 200_000
302
+ MAX_HITS = 2_000 # Max num of shared oligos between two sequences.
303
+ BYTES_IN_INT = 4
304
+ BYTES_IN_HIT = 2 * BYTES_IN_INT
305
+
306
# Constructor for initializing a Search object.
#
# options - Hash that must hold truthy values for :kmer_size, :step_size,
#           :dir, :prefix, :consensus, :coverage and :hits_max.
#
# Allocates the count and hit arrays and loads both index files.
#
# Raises TaxonomyError naming the first required option that is missing.
def initialize(options)
  @options = options

  required = %i(kmer_size step_size dir prefix consensus coverage hits_max)
  absent   = required.find { |key| !@options[key] }

  fail TaxonomyError, "missing #{absent} option" if absent

  @count_ary  = BioDSL::CAry.new(MAX_COUNT, BYTES_IN_INT)
  @hit_ary    = BioDSL::CAry.new(MAX_HITS, BYTES_IN_HIT)
  @tax_index  = load_tax_index
  @kmer_index = load_kmer_index
end
321
+
322
# Method to execute a search for a given sequence entry. First the
# sequence is broken down into kmers of a given kmer_size overlapping
# with a given step_size. See Taxonomy::Index.add.
# Now, for each taxonomic level, starting from species, all nodes
# for each kmer are looked up in the kmer index. The nodes containing
# most kmers are considered hits. If there are no hits at a taxonomic
# level, we move to the next level. Hits are sorted according to how
# many kmers matched this particular node and a consensus taxonomy
# string is determined. Hits are also filtered with the following
# options:
#   * hits_max  - Include maximally this number of hits in the consensus.
#   * best_only - Include only the best scoring hits in the consensus.
#                 That is if a hit consists of 344 kmers out of 345
#                 possible, only hits with 344 kmers are included.
#   * coverage  - Filter hits based on kmer coverage. If a hit contains
#                 fewer kmers than the total amount of kmers x coverage
#                 it will be filtered.
#   * consensus - For a number of hits accept consensus at a given level
#                 if within this percentage.
#
# entry - Object responding to #seq_name and #to_kmers.
#
# Returns a Result with the number of hits used and the consensus taxonomy
# string (underscores replaced by spaces), or Result(0, 'Unclassified')
# if no level produced a hit.
def execute(entry)
  kmers = entry.to_kmers(kmer_size: @options[:kmer_size],
                         step_size: @options[:step_size])

  puts "DEBUG Q: #{entry.seq_name}" if BioDSL.debug

  TAX_LEVELS.reverse_each do |level|
    kmers_lookup(kmers, level)

    hit_count = hits_select_C(@count_ary.ary, @count_ary.count,
                              @hit_ary.ary, kmers.size,
                              (@options[:best_only] ? 1 : 0),
                              @options[:coverage])
    hit_count = [hit_count, @options[:hits_max]].min

    if hit_count == 0
      puts "DEBUG no hits @ #{level}" if BioDSL.debug
      next
    end

    puts "DEBUG hit(s) @ #{level}" if BioDSL.debug

    taxpaths = Array.new(hit_count) do |i|
      # Each hit is a packed pair of unsigned ints: node id and kmer count.
      packed_hit     = @hit_ary.ary[BYTES_IN_HIT * i, BYTES_IN_HIT]
      node_id, count = packed_hit.unpack('II')

      taxpath = TaxPath.new(node_id, count, kmers.size, @tax_index)

      if BioDSL.debug
        seq_id = @tax_index[node_id].seq_id
        # Single-line literal: a backslash continuation inside the string
        # would drag the source indentation into the output.
        puts "DEBUG S_ID: #{seq_id} KMERS: [#{count}/#{kmers.size}] #{taxpath}"
      end

      taxpath
    end

    consensus = compile_consensus(taxpaths, hit_count)

    return Result.new(hit_count, consensus.tr('_', ' '))
  end

  Result.new(0, 'Unclassified')
end
386
+
387
+ private
388
+
389
# Method to load the tax index from the file <dir>/<prefix>_tax_index.dat.
#
# Lines starting with '#' are skipped. Every other line is split on tabs
# into seq id, node id, level, name and parent id.
#
# Returns a Hash mapping node id (Integer) to Node.
def load_tax_index
  path = File.join(@options[:dir], "#{@options[:prefix]}_tax_index.dat")

  File.foreach(path).each_with_object({}) do |line, index|
    line.chomp!

    next if line.start_with?('#')

    seq_id, node_id, level, name, parent_id = line.split("\t")
    id = node_id.to_i

    index[id] = Node.new(seq_id.to_i, id, level.to_sym, name, parent_id.to_i)
  end
end
409
+
410
# Method to load the kmer index from the file <dir>/<prefix>_kmer_index.dat.
#
# Lines starting with '#' are skipped. Every other line holds a level, a
# kmer and a ';' separated list of node ids.
#
# Returns a Hash mapping level (Symbol) to a Hash mapping kmer (Integer) to
# the node ids packed as native unsigned ints in a String. Looking up an
# unknown level yields (and stores) an empty Hash.
def load_kmer_index
  index = Hash.new { |by_level, level| by_level[level] = {} }
  path  = File.join(@options[:dir], "#{@options[:prefix]}_kmer_index.dat")

  File.foreach(path) do |line|
    line.chomp!

    next if line.start_with?('#')

    level, kmer, nodes = line.split("\t")
    node_ids           = nodes.split(';').map(&:to_i)

    index[level.to_sym][kmer.to_i] = node_ids.pack('I*')
  end

  index
end
429
+
430
# Method that given a list of kmers and a taxonomic level resets the
# count array and then, for each kmer found in the in-memory kmer index
# for that level, increments the count array positions of all nodes
# listed for the kmer.
#
# kmers - Array of Integer kmers.
# level - Taxonomic level Symbol.
def kmers_lookup(kmers, level)
  @count_ary.zero!

  kmers.each do |kmer|
    level_index = @kmer_index[level]

    next unless level_index

    packed_nodes = level_index[kmer]

    next unless packed_nodes

    # The node ids are packed ints, so the element count is bytes / int size.
    increment_C(@count_ary.ary, packed_nodes,
                packed_nodes.size / BYTES_IN_INT)
  end
end
446
+
447
# Method that given a list of taxonomic paths determines a consensus for
# each taxonomic level. E.g. for the kingdom level if 60% of the taxpaths
# indicate 'Bacteria' and the consensus is 50% then the consensus for the
# kingdom level will be reported as 'Bacteria(60)'. If the name at any
# level consists of multiple words they are treated independently. E.g. if
# we have three taxpaths at the species level with the names:
#
#   * Escherichia coli K-12
#   * Escherichia coli sp. AC3432
#   * Escherichia coli sp. AC1232
#
# The corresponding consensus for that level will be reported as
# 'Escherichia coli sp.(100/100/66)'. The fourth word in the last two
# taxonomy strings (AC3432 and AC1232) has a consensus below 50% and is
# ignored.
#
# taxpaths - Array of TaxPath objects.
# hit_size - Number of hits the percentages are calculated against.
#
# Levels are processed in order and processing stops at the first level
# without any accepted word.
#
# Returns the consensus String, or 'Unclassified' if no level qualified.
def compile_consensus(taxpaths, hit_size)
  parts = []

  decompose_consensus(taxpaths).each do |level, positions|
    accepted = positions.each_value.flat_map do |word_counts|
      word_counts.sort_by { |_, count| count }.reverse.select do |_, count|
        count >= hit_size * @options[:consensus]
      end
    end

    break if accepted.empty?

    words  = accepted.map { |word, _| word }
    scores = accepted.map { |_, count| ((count / hit_size.to_f) * 100).to_i }

    parts << "#{level.upcase}##{words.join('_')}(#{scores.join('/')})"
  end

  parts.empty? ? 'Unclassified' : parts.join(';')
end
491
+
492
# Method that given a list of taxonomic paths splits these into a data
# structure appropriate for subsequent determination of the taxonomic
# consensus.
#
# taxpaths - Array of objects responding to #nodes, where each node
#            responds to #level and #name.
#
# The first node of every path (the root) is ignored. Names are split on
# '_' and each word is counted per level and word position.
#
# Returns a nested Hash: level => word position => word => count.
def decompose_consensus(taxpaths)
  tax_hash = Hash.new do |by_level, level|
    by_level[level] = Hash.new { |by_pos, pos| by_pos[pos] = Hash.new(0) }
  end

  taxpaths.each do |taxpath|
    # drop(1) skips the root and, unlike [1..-1], yields [] rather than nil
    # for a path without any nodes.
    taxpath.nodes.drop(1).each do |node|
      node.name.split('_').each_with_index do |subname, i|
        tax_hash[node.level][i][subname] += 1
      end
    end
  end

  tax_hash
end
510
+
511
inline do |builder|
  # Struct for a 'hit' containing two pieces of information:
  #   * node_id - Node id for this particular node.
  #   * count   - Number of kmers matching this particular node.
  builder.prefix %(
    typedef struct
    {
      unsigned int node_id;
      unsigned int count;
    } hit;
  )

  # Qsort hit struct comparison function for sorting
  # hits according to count (highest count first).
  # Returns negative if a > b, positive if b > a and 0 if equal. The
  # counts are compared rather than subtracted, since subtracting unsigned
  # values and casting to int relies on wrap-around.
  builder.prefix %{
    int hit_cmp_by_count_C(const void *a, const void *b)
    {
      const hit *ia = (const hit *) a;
      const hit *ib = (const hit *) b;

      return (ib->count > ia->count) - (ib->count < ia->count);
    }
  }

  # Method to select only the best hits from the hit ary, which is sorted
  # according to count (highest count first).
  builder.prefix %{
    void hits_select_best_only_C(
      hit          *hit_ary,     // hit array.
      unsigned int *hit_ary_len  // hit array length.
    )
    {
      unsigned int i   = 0;
      unsigned int max = 0;

      max = hit_ary[i].count;

      i++;

      while ((i < *hit_ary_len) && (hit_ary[i].count == max)){
        i++;
      }

      *hit_ary_len = i;
    }
  }

  # Method for incrementing the count_ary. Each position in the count_ary
  # corresponds to a node_id. The value at the position that is
  # incremented corresponds to the number of shared kmers between this
  # node id and the query sequence.
  #
  # Raises RangeError if a node id does not fit in the count_ary buffer
  # instead of writing outside of it.
  builder.c %{
    void increment_C(
      VALUE _count_ary,   // Count ary.
      VALUE _nodes_ary,   // Nodes ary.
      VALUE _length       // Nodes ary length.
    )
    {
      int  *count_ary = (int *) StringValuePtr(_count_ary);
      int  *nodes_ary = (int *) StringValuePtr(_nodes_ary);
      int   length    = FIX2INT(_length);
      long  count_max = RSTRING_LEN(_count_ary) / (long) sizeof(int);
      int   i         = 0;

      for (i = 0; i < length; i++) {
        if (nodes_ary[i] < 0 || nodes_ary[i] >= count_max) {
          rb_raise(rb_eRangeError, "node id %d outside count array",
                   nodes_ary[i]);
        }

        count_ary[nodes_ary[i]]++;
      }
    }
  }

  # Method for selecting hits from the count_ary. Hits are selected
  # on a number of specified parameters:
  #   * best_only - if this is true only top scoring hits are reported.
  #   * coverage  - Filter hits based on kmer coverage. If a hit contains
  #                 fewer kmers than the total amount of kmers x coverage
  #                 it will be filtered.
  # The resulting hit_ary is sorted according to count (highest count
  # first) and the size of the hit_ary is returned.
  #
  # Raises RangeError if more hits qualify than fit in the hit_ary buffer
  # instead of writing outside of it.
  builder.c %{
    VALUE hits_select_C(
      VALUE _count_ary,      // Count ary.
      VALUE _count_ary_len,  // Count ary length.
      VALUE _hit_ary,        // Hit ary.
      VALUE _kmers_size,     // Number of kmers.
      VALUE _best_only,      // Option best_only
      VALUE _coverage        // Option coverage
    )
    {
      int    *count_ary     = (int *) StringValuePtr(_count_ary);
      int     count_ary_len = FIX2INT(_count_ary_len);
      hit    *hit_ary       = (hit *) StringValuePtr(_hit_ary);
      long    hit_ary_max   = RSTRING_LEN(_hit_ary) / (long) sizeof(hit);
      int     kmers_size    = FIX2INT(_kmers_size);
      int     best_only     = FIX2INT(_best_only);
      double  coverage      = NUM2DBL(_coverage);

      hit          new_hit = {0, 0};
      int          count   = 0;
      int          i       = 0;
      unsigned int j       = 0;

      for (i = 0; i < count_ary_len; i++)
      {
        if ((count = count_ary[i]))
        {
          if (count >= kmers_size * coverage)
          {
            if ((long) j >= hit_ary_max) {
              rb_raise(rb_eRangeError, "too many hits for hit array (max %ld)",
                       hit_ary_max);
            }

            new_hit.node_id = i;
            new_hit.count   = count;

            hit_ary[j] = new_hit;

            j++;
          }
        }
      }

      if (j > 1)
      {
        qsort(hit_ary, j, sizeof(hit), hit_cmp_by_count_C);

        if (best_only) {
          hits_select_best_only_C(hit_ary, &j);
        }
      }

      return UINT2NUM(j);
    }
  }
end
640
+
641
+ # Structure for taxonomic tree nodes.
642
+ Node = Struct.new(:seq_id, :node_id, :level, :name, :parent_id)
643
+
644
+ # Structure for holding the search result.
645
+ Result = Struct.new(:hits, :taxonomy)
646
+
647
# Class holding methods for manipulating taxonomic paths.
class TaxPath
  attr_reader :nodes

  # Constructor method for TaxPath objects.
  #
  # node_id        - Id of the node the path ends at.
  # kmers_observed - Number of kmers matching the node.
  # kmers_total    - Total number of kmers in the query.
  # tax_index      - Hash mapping node id to node (responding to #level,
  #                  #name and #parent_id).
  def initialize(node_id, kmers_observed, kmers_total, tax_index)
    @node_id        = node_id
    @kmers_observed = kmers_observed
    @kmers_total    = kmers_total
    @tax_index      = tax_index
    @nodes          = taxonomy_backtrack
  end

  # Method that returns a list of nodes for a given node_id and all
  # parent ids up the taxonomy tree, ordered from the root down. The list
  # is empty if the node id is not in the tax index.
  def taxonomy_backtrack
    nodes = []

    node_id = @node_id

    while (node = @tax_index[node_id])
      nodes << node

      break if node.level == :r # At root level

      node_id = node.parent_id
    end

    nodes.reverse
  end

  # Returns formatted taxonomy string, e.g. 'K#Bacteria;P#Proteobacteria'.
  # The first (root) node is left out; the result is an empty string when
  # there are no nodes below the root.
  def to_s
    # drop(1) instead of [1..-1] so that an empty node list gives '' rather
    # than raising on nil.
    @nodes.drop(1).map { |node| "#{node.level.upcase}##{node.name}" }.join(';')
  end
end
689
+ end
690
+ end
691
+ end