BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,691 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ class TaxonomyError < StandardError; end
30
+
31
+ # Module containing classes for creating a taxonomic database and searching
32
+ # this.
33
+ module Taxonomy
34
+ require 'narray'
35
+
36
# One-letter ids of the taxonomic levels, ordered from the root (:r) down to
# species (:s). Frozen so the shared constant cannot be mutated by accident.
TAX_LEVELS = [:r, :k, :p, :c, :o, :f, :g, :s].freeze
37
+
38
+ # rubocop: disable ClassLength
39
+
40
+ # Class for creating and databasing an index of a taxonomic tree. This is
41
+ # done in two steps. 1) A temporary tree is created using the taxonomic
42
+ # strings from the sequence names in a FASTA file. 2) A simplistic tree
43
+ # is constructed from the temporary tree allowing this to be saved to files.
44
+ # The resulting index consists of the following files:
45
+ # * taxonomy_tax_index.dat - return node for a given node id.
46
+ # * taxonomy_kmer_index.dat - return list of node ids for a given level and
47
+ # kmer.
48
+ class Index
49
+ require 'set'
50
+
51
+ attr_reader :size, :node_id
52
+ alias_method :size, :node_id
53
+
54
# Constructor for Index objects. Creates the root node of the taxonomic
# tree (node id 0) and checks that all required options are present.
#
# options - Hash that must hold truthy values for :kmer_size, :step_size,
#           :output_dir and :prefix.
#
# Raises TaxonomyError naming the first required option that is missing.
def initialize(options)
  @options = options # Option hash
  @seq_id  = 0       # Sequence id
  @node_id = 0       # Node id
  @tree    = TaxNode.new(nil, :r, 'root', nil, @node_id) # Root node
  @node_id += 1      # The root used id 0; the next node gets id 1.

  [:kmer_size, :step_size, :output_dir, :prefix].each do |required|
    next if @options[required]

    raise TaxonomyError, "missing #{required} option"
  end
end
66
+
67
+ # Method to add a Sequence entry to the taxonomic tree. The sequence name
68
+ # contains a taxonomic string.
69
+ #
70
+ # Example entry:
71
+ # seq_name: K#Bacteria;P#Proteobacteria;C#Gammaproteobacteria; \
72
+ # O#Vibrionales;F#Vibrionaceae;G#Vibrio;S#Vibrio
73
+ # seq: UCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCCUGA \
74
+ # UGCAGCCAUGCCGCGUGUAUGAAGGCCUUCGGGUUGUAACUC ...
75
+ #
76
+ # The sequence is reduced to a list of oligos of a given size and a given
77
+ # step size, e.g. 8 and 1, respectively:
78
+ #
79
+ # UCCUACGG
80
+ # CCUACGGG
81
+ # CUACGGGA
82
+ # UACGGGAG
83
+ # ACGGGAGG
84
+ # ...
85
+ #
86
+ # Each oligo is encoded as a kmer (integer) by encoding two bits per
87
+ # nucleotide:
88
+ #
89
+ # A = 00
90
+ # U = 01
91
+ # C = 10
92
+ # G = 11
93
+ #
94
+ # E.g. UCCUACGG = 0110100100101111 = 26927
95
+ #
96
+ # For each node in the tree a set is kept containing information of
97
+ # all observed oligos for that particular node. Thus all child nodes
98
+ # contain a subset of oligos compared to the parent node.
99
def add(entry)
  node       = @tree
  old_name   = false
  tax_levels = entry.seq_name.split(';')

  # The sequence name must hold one field per level below the root.
  if tax_levels.size != TAX_LEVELS.size - 1
    fail TaxonomyError, "Wrong number of tax levels in #{entry.seq_name}"
  end

  tax_levels.each_with_index do |tax_level, i|
    level, name = tax_level.split('#')

    # An empty field (e.g. "K#Bacteria;;C#...") yields level == nil. to_s
    # makes that case fail the level check below with a TaxonomyError
    # instead of blowing up with a NoMethodError on nil.
    level_sym = level.to_s.downcase.to_sym

    if level_sym != TAX_LEVELS[i + 1]
      fail TaxonomyError, "Unexpected tax id in #{entry.seq_name}"
    end

    if name
      # A named level directly below an unnamed level is a gap in the path.
      if i > 0 && !old_name
        fail TaxonomyError, "Gapped tax level info in #{entry.seq_name}"
      end

      # Reuse the existing child node or create a new one with a fresh id.
      child = node[name]

      unless child
        child = TaxNode.new(node, level_sym, name, @seq_id, @node_id)
        @node_id += 1
      end

      # Only the deepest named level of the entry receives the kmers here.
      if leaf?(tax_levels, i)
        kmers = entry.to_kmers(kmer_size: @options[:kmer_size],
                               step_size: @options[:step_size])
        child.kmers |= Set.new(kmers)
      end

      node[name] = child
      node       = child
    end

    old_name = name
  end

  @seq_id += 1

  self
end
144
+
145
# Finalize the taxonomic tree and write it to the index files: kmers are
# first propagated from the children up to their parents, then the kmer
# index and finally the tax index are written.
def save
  tree_union(@tree)

  save_kmer_index

  save_tax_index
end
152
+
153
# Testing method that searches the tree for the node with the given id.
# Children of a visited node are pushed to the front of the work list, and
# nil children are skipped.
#
# id - Node id to look for.
#
# Returns the matching node or nil if no node has that id.
def get_node(id)
  pending = [@tree]

  loop do
    break if pending.empty?

    candidate = pending.shift

    return candidate if candidate.node_id == id

    candidate.children.each_value do |kid|
      pending.unshift(kid) unless kid.nil?
    end
  end

  nil
end
170
+
171
# Method that traverses the tax tree depth first and populates every parent
# node with the union of the kmers of all of the parent's children.
#
# node - Node to start from (defaults to the root of the tree).
def tree_union(node = @tree)
  node.children.each_value do |child|
    # Complete the child's subtree before folding its kmers into the parent.
    tree_union(child)

    next if child.kmers.nil?

    node.kmers = node.kmers.nil? ? child.kmers : node.kmers | child.kmers
  end
end
185
+
186
+ private
187
+
188
# Method that determines if the level at index i is the last named level,
# i.e. the following level is either absent or has no name after the '#'.
#
# tax_levels - Array of 'LEVEL#name' strings.
# i          - Index of the level to inspect.
#
# Returns true or false.
def leaf?(tax_levels, i)
  following = tax_levels[i + 1]

  !(following && following.split('#')[1])
end
196
+
197
# Save tax index to file. Writes a header line followed by one tab separated
# row per node (seq id, node id, level, name and parent id) to
# <output_dir>/<prefix>_tax_index.dat.
def save_tax_index
  path   = File.join(@options[:output_dir],
                     "#{@options[:prefix]}_tax_index.dat")
  header = %w(#SEQ_ID NODE_ID LEVEL NAME PARENT_ID)

  File.open(path, 'wb') do |out|
    out.puts header.join("\t")
    pending = [@tree]

    until pending.empty?
      current = pending.shift
      fields  = [current.seq_id, current.node_id, current.level,
                 current.name, current.parent_id]

      out.puts fields.join("\t")

      # Children go to the front of the work list; nil children are skipped.
      current.children.each_value do |child|
        pending.unshift(child) unless child.nil?
      end
    end
  end
end
217
+
218
# Construct and save kmer index to file. The tree is processed one level at
# a time so only the index of a single level is held in memory. For every
# level one row is written per kmer: the level id, the kmer and the sorted,
# ';' separated ids of the nodes that hold that kmer.
#
# Returns the kmer index Hash of the last level that was processed.
def save_kmer_index
  path = File.join(@options[:output_dir],
                   "#{@options[:prefix]}_kmer_index.dat")

  File.open(path, 'wb') do |out|
    out.puts %w(#LEVEL KMER NODES).join("\t")

    depth         = 0
    current_level = [@tree]
    kmer_index    = nil

    until current_level.empty?
      kmer_index = Hash.new { |hash, kmer| hash[kmer] = [] }
      next_level = []

      current_level.each do |node|
        node.kmers.to_a.each { |kmer| kmer_index[kmer] << node.node_id }
        node.children.each_value { |child| next_level << child if child }
      end

      kmer_index.keys.sort.each do |kmer|
        node_ids = kmer_index[kmer].sort.join(';')

        out.puts [TAX_LEVELS[depth], kmer, node_ids].join("\t")
      end

      current_level = next_level
      depth += 1
    end

    kmer_index
  end
end
251
+
252
# Class for the nodes used for constructing the taxonomic tree. Each node
# knows its parent, holds its children in a Hash keyed by taxonomic name and
# carries the set of kmers observed for it.
class TaxNode
  attr_accessor :kmers
  attr_reader :parent, :level, :name, :children, :seq_id, :node_id

  # Constructor for TaxNode objects.
  #
  # parent  - Parent TaxNode or nil for the root.
  # level   - Taxonomic level as a Symbol.
  # name    - Taxonomic name.
  # seq_id  - Id of a representative sequence (for debugging).
  # node_id - Id of this node.
  def initialize(parent, level, name, seq_id, node_id)
    @parent   = parent  # Parent node.
    @level    = level   # Taxonomic level.
    @name     = name    # Taxonomic name.
    @kmers    = Set.new # Kmer set.
    @seq_id   = seq_id  # Seq id (a representative seq for debugging).
    @node_id  = node_id # Node id.
    @children = {}      # Child node hash.
  end

  # Returns parent node id if a parent exists, else nil.
  def parent_id
    @parent.node_id if @parent
  end

  # Returns an array of children node ids. Nodes expose their id as node_id
  # (there is no id method), and nil children are skipped like elsewhere in
  # the tree traversals.
  def children_ids
    @children.values.compact.map(&:node_id)
  end

  # Getter method for node children.
  def [](key)
    @children[key]
  end

  # Setter method for node children.
  def []=(key, value)
    @children[key] = value
  end
end
292
+ end
293
+
294
+ # Class for searching sequences in a taxonomic database. The database
295
+ # consists of a taxonomic tree index and indices for each taxonomic level
296
+ # saved in the following files:
297
+ # * taxonomy_tax_index.dat - return node for a given node id.
298
+ # * taxonomy_kmer_index.dat - return list of node ids for a given level and
299
+ # kmer.
300
+ class Search
301
+ MAX_COUNT = 200_000
302
+ MAX_HITS = 2_000 # Max num of shared oligos between two sequences.
303
+ BYTES_IN_INT = 4
304
+ BYTES_IN_HIT = 2 * BYTES_IN_INT
305
+
306
# Constructor for initializing a Search object. Validates the options,
# allocates the count and hit arrays and loads both index files.
#
# options - Hash that must hold truthy values for :kmer_size, :step_size,
#           :dir, :prefix, :consensus, :coverage and :hits_max.
#
# Raises TaxonomyError naming the first required option that is missing.
def initialize(options)
  @options = options

  required = [:kmer_size, :step_size, :dir, :prefix, :consensus, :coverage,
              :hits_max]
  missing  = required.find { |opt| !@options[opt] }

  raise TaxonomyError, "missing #{missing} option" if missing

  @count_ary  = BioDSL::CAry.new(MAX_COUNT, BYTES_IN_INT)
  @hit_ary    = BioDSL::CAry.new(MAX_HITS, BYTES_IN_HIT)
  @tax_index  = load_tax_index
  @kmer_index = load_kmer_index
end
321
+
322
+ # Method to execute a search for a given sequence entry. First the
323
+ # sequence is broken down into unique kmers of a given kmer_size
324
+ # overlapping with a given step_size. See Taxonomy::Index.add.
325
+ # Now, for each taxonomic level, starting from species all nodes
326
+ # for each kmer are looked up in the database. The nodes containing
327
+ # most kmers are considered hits. If there are no hits at a taxonomic
328
+ # level, we move to the next level. Hits are sorted according to how
329
+ # many kmers matched this particular node and a consensus taxonomy
330
+ # string is determined. Hits are also filtered with the following
331
+ # options:
332
+ # * hits_max - Include maximally this number of hits in the consensus.
333
+ # * best_only - Include only the best scoring hits in the consensus.
334
+ # That is if a hit consists of 344 kmers out of 345
335
+ # possible, only hits with 344 kmers are included.
336
+ # * coverage - Filter hits based on kmer coverage. If a hit contains
337
+ # fewer kmers than the total amount of kmers x coverage
338
+ # it will be filtered.
339
+ # * consensus - For a number of hits accept consensus at a given level
340
+ # if within this percentage.
341
def execute(entry)
  kmers = entry.to_kmers(kmer_size: @options[:kmer_size],
                         step_size: @options[:step_size])

  puts "DEBUG Q: #{entry.seq_name}" if BioDSL.debug

  # Walk the levels from the most specific one towards the root and return
  # at the first level that yields any hits.
  TAX_LEVELS.reverse_each do |level|
    kmers_lookup(kmers, level)

    best_flag = @options[:best_only] ? 1 : 0
    hit_count = hits_select_C(@count_ary.ary, @count_ary.count, @hit_ary.ary,
                              kmers.size, best_flag, @options[:coverage])

    # Never use more hits than requested with the hits_max option.
    hit_count = @options[:hits_max] if @options[:hits_max] < hit_count

    if hit_count == 0
      puts "DEBUG no hits @ #{level}" if BioDSL.debug

      next
    end

    puts "DEBUG hit(s) @ #{level}" if BioDSL.debug

    taxpaths = (0...hit_count).map do |i|
      # Each hit occupies BYTES_IN_HIT bytes: a node id followed by a count.
      offset         = BYTES_IN_HIT * i
      node_id, count = @hit_ary.ary[offset...(offset + BYTES_IN_HIT)].unpack('II')
      taxpath        = TaxPath.new(node_id, count, kmers.size, @tax_index)

      if BioDSL.debug
        seq_id = @tax_index[node_id].seq_id
        puts "DEBUG S_ID: #{seq_id} KMERS: [#{count}/#{kmers.size}] \
#{taxpath}"
      end

      taxpath
    end

    consensus = compile_consensus(taxpaths, hit_count)

    return Result.new(hit_count, consensus.tr('_', ' '))
  end

  Result.new(0, 'Unclassified')
end
386
+
387
+ private
388
+
389
# Method to load and return the tax_index from the file
# <dir>/<prefix>_tax_index.dat. Lines starting with '#' are skipped; every
# other line is split on tabs into seq id, node id, level, name and parent
# id.
#
# Returns a Hash mapping node id (Integer) to Node.
def load_tax_index
  path = File.join(@options[:dir], "#{@options[:prefix]}_tax_index.dat")

  File.foreach(path).each_with_object({}) do |raw, index|
    line = raw.chomp

    next if line[0] == '#'

    seq_id, node_id, level, name, parent_id = line.split("\t")
    id = node_id.to_i

    # Missing or empty id fields become 0 via to_i.
    index[id] = Node.new(seq_id.to_i, id, level.to_sym, name, parent_id.to_i)
  end
end
409
+
410
# Method to load and return the kmer_index from the file
# <dir>/<prefix>_kmer_index.dat. Lines starting with '#' are skipped; every
# other line holds a level, a kmer and a ';' separated list of node ids.
#
# Returns a Hash mapping level (Symbol) to a Hash that maps kmer (Integer)
# to the node ids packed as unsigned ints ('I*').
def load_kmer_index
  index = Hash.new { |hash, level| hash[level] = {} }
  path  = File.join(@options[:dir], "#{@options[:prefix]}_kmer_index.dat")

  File.foreach(path) do |raw|
    line = raw.chomp

    next if line[0] == '#'

    level, kmer, nodes = line.split("\t")
    node_ids = nodes.split(';').map { |id| id.to_i }

    index[level.to_sym][kmer.to_i] = node_ids.pack('I*')
  end

  index
end
429
+
430
# Method that given a list of kmers and a taxonomic level first zeroes the
# count array and then, for each kmer found in the kmer index of that
# level, increments the count array positions of all nodes holding the
# kmer. Kmers without an index entry are skipped.
def kmers_lookup(kmers, level)
  @count_ary.zero!

  kmers.each do |kmer|
    level_index = @kmer_index[level]

    next unless level_index

    packed_nodes = level_index[kmer]

    next unless packed_nodes

    # packed_nodes is a packed string of ints, hence the byte size division.
    increment_C(@count_ary.ary, packed_nodes,
                packed_nodes.size / BYTES_IN_INT)
  end
end
446
+
447
+ # Method that given a list of taxonomic paths determines a consensus for
448
+ # each taxonomic level. E.g. for the kingdom level if 60% of the taxpaths
449
+ # indicate 'Bacteria' and the consensus is 50% then the consensus for the
450
+ # kingdom level will be reported as 'Bacteria(60)'. If the name at any
451
+ # level consists of multiple words they are treated independently. E.g if
452
+ # we have three taxpath at the species level with the names:
453
+ #
454
+ # * Escherichia coli K-12
455
+ # * Escherichia coli sp. AC3432
456
+ # * Escherichia coli sp. AC1232
457
+ #
458
+ # The corresponding consensus for that level will be reported as
459
+ # 'Escherichia coli sp.(100/100/66)'. The fourth word in the last two
460
+ # taxonomy strings (AC3432 and AC1232) have a consensus below 50% and are
461
+ # ignored.
462
def compile_consensus(taxpaths, hit_size)
  per_level = []

  decompose_consensus(taxpaths).each do |level, positions|
    names    = []
    percents = []

    positions.each_value do |tally|
      # Highest counts first within each word position.
      ranked = tally.sort_by { |_, count| count }.reverse

      ranked.each do |subname, count|
        next unless count >= hit_size * @options[:consensus]

        names << subname
        percents << ((count / hit_size.to_f) * 100).to_i
      end
    end

    # Stop at the first level without consensus; deeper levels are dropped.
    break if names.empty?

    per_level << "#{level.upcase}##{names.join('_')}(#{percents.join('/')})"
  end

  per_level.empty? ? 'Unclassified' : per_level.join(';')
end
491
+
492
# Method that given a list of taxonomic paths splits these into a data
# structure appropriate for the subsequent determination of the taxonomic
# consensus. The root node of each path is ignored and names are split on
# '_' into words.
#
# Returns a nested Hash: level => word position => word => count.
def decompose_consensus(taxpaths)
  tally = Hash.new do |levels, level|
    levels[level] = Hash.new { |positions, pos| positions[pos] = Hash.new(0) }
  end

  # Skip the root level of every path, i.e. start at index 1.
  below_root = taxpaths.flat_map { |taxpath| taxpath.nodes[1..-1] }

  below_root.each do |node|
    node.name.split('_').each.with_index do |word, pos|
      tally[node.level][pos][word] += 1
    end
  end

  tally
end
510
+
511
inline do |builder|
  # Struct for a 'hit' containing two pieces of information:
  #   * node_id - Node id for this particular node.
  #   * count   - Number of kmers matching this particular node.
  builder.prefix %(
    typedef struct
    {
      unsigned int node_id;
      unsigned int count;
    } hit;
  )

  # Qsort hit struct comparison function for sorting hits according to
  # count (highest count first). Returns negative if a > b, positive if
  # b > a and 0 on ties. The counts are compared directly instead of
  # casting an unsigned subtraction to int.
  builder.prefix %{
    int hit_cmp_by_count_C(const void *a, const void *b)
    {
      const hit *ia = (const hit *) a;
      const hit *ib = (const hit *) b;

      return (ib->count > ia->count) - (ib->count < ia->count);
    }
  }

  # Helper returning the index of the hit with the lowest count. Used to
  # decide which hit to evict once the hit ary is full.
  builder.prefix %{
    unsigned int hit_weakest_C(
      hit          *hit_ary,       // hit array.
      unsigned int  hit_ary_len    // hit array length (must be > 0).
    )
    {
      unsigned int i   = 0;
      unsigned int min = 0;

      for (i = 1; i < hit_ary_len; i++) {
        if (hit_ary[i].count < hit_ary[min].count) {
          min = i;
        }
      }

      return min;
    }
  }

  # Method to select only the best hits from the hit ary, which is sorted
  # according to count (highest count first). The hit ary must hold at
  # least one hit.
  builder.prefix %{
    void hits_select_best_only_C(
      hit          *hit_ary,       // hit array.
      unsigned int *hit_ary_len    // hit array length.
    )
    {
      unsigned int i   = 0;
      unsigned int max = 0;

      max = hit_ary[i].count;

      i++;

      while ((i < *hit_ary_len) && (hit_ary[i].count == max)){
        i++;
      }

      *hit_ary_len = i;
    }
  }

  # Method for incrementing the count_ary. Each position in the count_ary
  # corresponds to a node_id. The value at the position that is
  # incremented corresponds to the number of shared kmers between this
  # node id and the query sequence.
  #
  # Raises RangeError if a node id does not fit in the count_ary, instead
  # of writing outside of the buffer.
  builder.c %{
    void increment_C(
      VALUE _count_ary,   // Count ary.
      VALUE _nodes_ary,   // Nodes ary.
      VALUE _length       // Nodes ary length.
    )
    {
      int  *count_ary = (int *) StringValuePtr(_count_ary);
      int  *nodes_ary = (int *) StringValuePtr(_nodes_ary);
      int   length    = FIX2INT(_length);
      long  capacity  = RSTRING_LEN(_count_ary) / (long) sizeof(int);
      int   i         = 0;

      for (i = 0; i < length; i++) {
        if (nodes_ary[i] < 0 || nodes_ary[i] >= capacity) {
          rb_raise(rb_eRangeError,
                   "node id %d does not fit in the count array",
                   nodes_ary[i]);
        }

        count_ary[nodes_ary[i]]++;
      }
    }
  }

  # Method for selecting hits based from the count_ary. Hits are selected
  # on a number of specified parameters:
  #   * best_only - if this is true only top scoring hits are reported.
  #   * coverage  - Filter hits based on kmer coverage. If a hit contains
  #                 fewer kmers than the total amount of kmers x coverage
  #                 it will be filtered.
  # The resulting hit_ary is sorted according to count (highest count
  # first) and the size of the hit_ary is returned.
  #
  # The hit_ary is never written beyond its byte size: once it is full, a
  # new hit only replaces the stored hit with the lowest count, so the
  # strongest hits are kept.
  builder.c %{
    VALUE hits_select_C(
      VALUE _count_ary,       // Count ary.
      VALUE _count_ary_len,   // Count ary length.
      VALUE _hit_ary,         // Hit ary.
      VALUE _kmers_size,      // Number of kmers.
      VALUE _best_only,       // Option best_only
      VALUE _coverage         // Option coverage
    )
    {
      int          *count_ary     = (int *) StringValuePtr(_count_ary);
      int           count_ary_len = FIX2INT(_count_ary_len);
      hit          *hit_ary       = (hit *) StringValuePtr(_hit_ary);
      int           kmers_size    = FIX2INT(_kmers_size);
      int           best_only     = FIX2INT(_best_only);
      double        coverage      = NUM2DBL(_coverage);
      long          count_cap     = RSTRING_LEN(_count_ary) / (long) sizeof(int);
      unsigned int  hit_cap       = (unsigned int) (RSTRING_LEN(_hit_ary) / (long) sizeof(hit));

      hit          new_hit = {0, 0};
      int          count   = 0;
      int          i       = 0;
      unsigned int j       = 0;
      unsigned int weakest = 0;   // Index of the lowest count when full.

      // Never read beyond the bytes actually held by the count ary.
      if (count_ary_len > count_cap) {
        count_ary_len = (int) count_cap;
      }

      for (i = 0; i < count_ary_len; i++)
      {
        if ((count = count_ary[i]))
        {
          if (count >= kmers_size * coverage)
          {
            new_hit.node_id = i;
            new_hit.count   = count;

            if (j < hit_cap)
            {
              hit_ary[j] = new_hit;

              j++;

              if (j == hit_cap) {
                weakest = hit_weakest_C(hit_ary, j);
              }
            }
            else if (hit_cap > 0 && new_hit.count > hit_ary[weakest].count)
            {
              hit_ary[weakest] = new_hit;

              weakest = hit_weakest_C(hit_ary, j);
            }
          }
        }
      }

      if (j > 1)
      {
        qsort(hit_ary, j, sizeof(hit), hit_cmp_by_count_C);

        if (best_only) {
          hits_select_best_only_C(hit_ary, &j);
        }
      }

      return UINT2NUM(j);
    }
  }
end
640
+
641
+ # Structure for taxonomic tree nodes.
642
+ Node = Struct.new(:seq_id, :node_id, :level, :name, :parent_id)
643
+
644
+ # Structure for holding the search result.
645
+ Result = Struct.new(:hits, :taxonomy)
646
+
647
# Class holding methods for manipulating taxonomic paths. A path is the
# list of nodes from the root of the tax index down to a given node.
class TaxPath
  attr_reader :nodes

  # Constructor method for TaxPath objects.
  #
  # node_id        - Id of the node the path ends at.
  # kmers_observed - Number of kmers observed for the node.
  # kmers_total    - Total number of kmers.
  # tax_index      - Index mapping node id to node.
  def initialize(node_id, kmers_observed, kmers_total, tax_index)
    @node_id        = node_id
    @kmers_observed = kmers_observed
    @kmers_total    = kmers_total
    @tax_index      = tax_index
    @nodes          = taxonomy_backtrack
  end

  # Method that returns a list of nodes for a given node_id and all
  # parent ids up the taxonomy tree, ordered from the root down. The walk
  # stops at the root level or at an id missing from the tax index.
  def taxonomy_backtrack
    lineage    = []
    current_id = @node_id

    loop do
      node = @tax_index[current_id]

      break unless node

      lineage.unshift(node)

      break if node.level == :r # At root level

      current_id = node.parent_id
    end

    lineage
  end

  # Returns formatted taxonomy string with the root level left out.
  def to_s
    @nodes[1..-1].map { |node| "#{node.level.upcase}##{node.name}" }.join(';')
  end
end
689
+ end
690
+ end
691
+ end