BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,217 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Classify sequences in the stream.
30
+ #
31
+ # +classify_seq+ searches sequences in the stream against a pre-indexed
32
+ # (using +index_taxonomy+) database. The database consists of a taxonomic tree
33
+ # index and indices for each taxonomic level saved in the following files
34
+ # (here using the prefix "taxonomy"):
35
+ #
36
+ # * taxonomy_tax_index.dat - return node for a given node id.
37
+ # * taxonomy_kmer_index.dat - return list of node ids for a given level
38
+ # and kmer.
39
+ #
40
+ # Each sequence is broken down into unique kmers of a given kmer_size
41
+ # overlapping with a given step_size - see +index_taxonomy+. Now, for each
42
+ # taxonomic level, starting from species, all nodes for each kmer are looked
43
+ # up in the database. The nodes containing most kmers are considered hits.
44
+ # If there are no hits at a taxonomic level, we move to the next level. Hits
45
+ # are sorted according to how many kmers matched this particular node and a
46
+ # consensus taxonomy string is determined. Hits are also filtered with the
47
+ # following options:
48
+ #
49
+ # * hits_max - Include maximally this number of hits in the consensus.
50
+ # * best_only - Include only the best scoring hits in the consensus.
51
+ # That is if a hit consists of 344 kmers out of 345
52
+ # possible, only hits with 344 kmers are included.
53
+ # * coverage - Filter hits based on kmer coverage. If a hit contains
54
+ # fewer kmers than the total amount of kmers x coverage
55
+ # it will be filtered.
56
+ # * consensus - For a number of hits accept consensus at a given level
57
+ # if within this percentage.
58
+ #
59
+ # The output of +classify_seq+ are sequence type records with the
60
+ # additional keys:
61
+ #
62
+ # * TAXONOMY_HITS - The number of hits used in the consensus.
63
+ # * TAXONOMY - The taxonomy string.
64
+ #
65
+ # The consensus is determined from a list of taxonomic strings, i.e. the
66
+ # TAXONOMY_HITS, and is composed of a consensus for each taxonomic level.
67
+ # E.g. for the kingdom level if 60% of the taxonomic strings indicate
68
+ # 'Bacteria' and the consensus is 50% then the consensus for the kingdom
69
+ # level will be reported as 'Bacteria(60)'. If the name at any level
70
+ # consists of multiple words they are treated independently. E.g if we have
71
+ # three taxonomic strings at the species level with the names:
72
+ #
73
+ # * Escherichia coli K-12
74
+ # * Escherichia coli sp. AC3432
75
+ # * Escherichia coli sp. AC1232
76
+ #
77
+ # The corresponding consensus for that level will be reported as
78
+ # 'Escherichia coli sp.(100/100/66)'. The fourth word in the last two
79
+ # taxonomy strings (AC3432 and AC1232) have a consensus below 50% and are
80
+ # ignored.
81
+ #
82
+ # == Usage
83
+ #
84
+ # classify_seq(<dir: <dir>>[, prefix: <string>[, kmer_size: <uint>
85
+ # [, step_size: <uint>[, hits_max: <uint>[, consensus:
86
+ # <float>[, coverage: <float>[, best_only: <bool>]]]]]]])
87
+ #
88
+ # === Options
89
+ #
90
+ # * dir: <dir> - Directory containing taxonomy files.
91
+ # * prefix: <string> - Taxonomy files prefix (default="taxonomy").
92
+ # * kmer_size: <uint> - Kmer size (default=8).
93
+ # * step_size: <uint> - Step size (default=1).
94
+ # * hits_max: <uint> - Maximum hits to include in consensus (default=50).
95
+ # * consensus: <float> - Consensus cutoff (default=0.51).
96
+ # * coverage: <float> - Coverage cutoff (default=0.9).
97
+ # * best_only: <bool> - Only use best hits for consensus (default=true).
98
+ #
99
+ # == Examples
100
+ #
101
+ # To classify a bunch of OTU sequences in the file +otus.fna+ we do:
102
+ #
103
+ # BP.new.
104
+ # read_fasta(input: "otus.fna").
105
+ # classify_seq(dir: "RDP11_3").
106
+ # write_table(keys: [:SEQ_NAME, :TAXONOMY_HITS, :TAXONOMY]).
107
+ # run
108
+ #
109
+ # OTU_0 1 K#Bacteria(100);P#Proteobacteria(100);C#Gammaproteobacteria...
110
+ # OTU_1 1 K#Bacteria(100);P#Proteobacteria(100);C#Gammaproteobacteria...
111
+ # OTU_2 1 K#Bacteria(100);P#Proteobacteria(100);C#Gammaproteobacteria...
112
+ # OTU_3 1 K#Bacteria(100);P#Proteobacteria(100);C#Gammaproteobacteria...
113
+ # OTU_4 2 K#Bacteria(100);P#Fusobacteria(100);C#Fusobacteriia(100);O#...
114
class ClassifySeq
  # Status counters tracked by this command.
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for the ClassifySeq class.
  #
  # Stores the options hash, validates it and then fills in defaults for any
  # option that was not given. Note that the hash passed in is modified in
  # place when defaults are applied.
  #
  # @param [Hash] options Options hash.
  # @option options [String]  :dir       Directory path with indexes.
  # @option options [String]  :prefix    Index prefix.
  # @option options [Integer] :kmer_size Kmer size.
  # @option options [Integer] :step_size Step size.
  # @option options [Integer] :hits_max  Max hits to report per sequence.
  # @option options [Float]   :consensus Taxonomy string consensus percent.
  # @option options [Float]   :coverage  Kmer coverage filter percent.
  # @option options [Boolean] :best_only Flag to report best hit only.
  #
  # @return [ClassifySeq] Returns an instance of the class.
  def initialize(options)
    @options = options

    check_options
    defaults
  end

  # Return a lambda for the ClassifySeq command.
  #
  # The lambda reads every record from +input+, classifies those that carry
  # a sequence and emits every record (classified or not) to +output+.
  #
  # @return [Proc] Returns the command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      # Explicit reset kept from the original implementation.
      # NOTE(review): presumably redundant with status_init - confirm.
      @status[:sequences_in] = 0

      search = BioDSL::Taxonomy::Search.new(@options)

      input.each_with_index do |record, i|
        @status[:records_in] += 1

        # Test the value rather than the mere presence of the key: a record
        # with SEQ set to nil cannot be classified and is passed through
        # untouched instead of raising on nil.length.
        classify_seq(record, i, search) if record[:SEQ]

        output << record
        @status[:records_out] += 1
      end
    end
  end

  private

  # Check options.
  #
  # Verifies that only known options are given, that :dir is present and
  # exists, that :best_only is nil/true/false and runs the numeric range
  # assertions.
  def check_options
    options_allowed(@options, :dir, :prefix, :kmer_size, :step_size,
                    :hits_max, :consensus, :coverage, :best_only)
    options_required(@options, :dir)
    options_dirs_exist(@options, :dir)
    options_allowed_values(@options, best_only: [nil, true, false])

    run_assertions
  end

  # Run assertions on the numeric options.
  def run_assertions
    options_assert(@options, ':kmer_size > 0')
    options_assert(@options, ':kmer_size <= 12')
    options_assert(@options, ':step_size > 0')
    options_assert(@options, ':step_size <= 12')
    options_assert(@options, ':hits_max > 0')
    options_assert(@options, ':consensus > 0')
    options_assert(@options, ':consensus <= 1')
    options_assert(@options, ':coverage > 0')
    options_assert(@options, ':coverage <= 1')
  end

  # Set default options for everything the caller left unset.
  def defaults
    @options[:prefix]    ||= 'taxonomy'
    @options[:kmer_size] ||= 8
    @options[:step_size] ||= 1
    @options[:hits_max]  ||= 50
    @options[:consensus] ||= 0.51
    @options[:coverage]  ||= 0.9
    # ||= would overwrite an explicit false, hence the nil check.
    @options[:best_only] = true if @options[:best_only].nil?
  end

  # Execute classification of a sequence containing record.
  #
  # Updates the sequence and residue counters and adds the keys :TAXONOMY,
  # :TAXONOMY_HITS and :RECORD_TYPE to the record in place.
  #
  # @param record [Hash] BioDSL record with a non-nil :SEQ value.
  # @param i [Integer] Record number, used as name if :SEQ_NAME is missing.
  # @param search [BioDSL::Taxonomy::Search] Search object.
  def classify_seq(record, i, search)
    seq_length = record[:SEQ].length

    @status[:sequences_in]  += 1
    @status[:sequences_out] += 1
    @status[:residues_in]   += seq_length
    @status[:residues_out]  += seq_length

    seq_name = record[:SEQ_NAME] || i.to_s
    entry    = BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])
    result   = search.execute(entry)

    record[:TAXONOMY]      = result.taxonomy
    record[:TAXONOMY_HITS] = result.hits
    record[:RECORD_TYPE]   = 'taxonomy'
  end
end
217
+ end
@@ -0,0 +1,226 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Run classify_seq_mothur on sequences in the stream.
30
+ #
31
+ # This is a wrapper for the +mothur+ command +classify.seqs()+. Basically,
32
+ # it classifies sequences in the stream given a database file and a taxonomy
33
+ # file which can be downloaded here:
34
+ #
35
+ # http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip
36
+ #
37
+ # Please refer to the manual:
38
+ #
39
+ # http://www.mothur.org/wiki/Classify.seqs
40
+ #
41
+ # Mothur must be installed for +classify_seq_mothur+ to work. Read more here:
42
+ #
43
+ # http://www.mothur.org/
44
+ #
45
+ # == Usage
46
+ #
47
+ # classify_seq_mothur(<database: <file>>, <taxonomy: <file>>
48
+ # [, confidence: <uint>[, cpus: <uint>]])
49
+ #
50
+ # === Options
51
+ #
52
+ # * database: <file> - Database to search.
53
+ # * taxonomy: <file> - Taxonomy file for mapping names.
54
+ # * confidence: <uint> - Confidence threshold (default=80).
55
+ # * cpus: <uint> - Number of CPU cores to use (default=1).
56
+ #
57
+ # == Examples
58
+ #
59
+ # To classify a bunch of OTU sequences in the file +otus.fna+ we do:
60
+ #
61
+ # database = "trainset9_032012.pds.fasta"
62
+ # taxonomy = "trainset9_032012.pds.tax"
63
+ #
64
+ # BP.new.
65
+ # read_fasta(input: "otus.fna").
66
+ # classify_seq_mothur(database: database, taxonomy: taxonomy).
67
+ # grab(exact: true, keys: :RECORD_TYPE, select: "taxonomy").
68
+ # write_table(output: "classified.tab", header: true, force: true,
69
+ # skip: [:RECORD_TYPE]).
70
+ # run
71
class ClassifySeqMothur
  require 'English'
  require 'BioDSL/helpers/aux_helper'

  include AuxHelper

  # Status counters tracked by this command.
  STATS = %i(records_in records_out sequences_in sequences_out
             residues_in residues_out)

  # Constructor for ClassifySeqMothur.
  #
  # Checks that the +mothur+ executable is available, validates the options
  # and fills in defaults. The options hash is modified in place when
  # defaults are applied.
  #
  # @param options [Hash] Options hash.
  # @option options [String]  :database   Path to database file.
  # @option options [String]  :taxonomy   Path to taxonomy file.
  # @option options [Integer] :confidence Confidence cutoff.
  # @option options [Integer] :cpus       Number of CPUs to use.
  #
  # @return [ClassifySeqMothur] Instance of class.
  def initialize(options)
    @options = options

    aux_exist('mothur')
    check_options
    defaults
  end

  # Command lambda for ClassifySeqMothur.
  #
  # All input records are passed through to the output while sequences are
  # written to a temporary FASTA file. Mothur is then run on that file and
  # one taxonomy record per classified sequence is appended to the output.
  #
  # @raise [RuntimeError] If mothur fails or leaves no taxonomy file.
  #
  # @return [Proc] Lambda for the command.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      TmpDir.create('input.fasta') do |tmp_in, tmp_dir|
        process_input(input, output, tmp_in)
        run_mothur(tmp_dir, tmp_in)

        tmp_out = Dir.glob("#{tmp_dir}/input.*.taxonomy").first

        # Fail with a clear message instead of handing nil to the parser.
        fail 'Mothur produced no taxonomy file' unless tmp_out

        process_output(output, tmp_out)
      end
    end
  end

  private

  # Check options.
  #
  # Defaults are not applied here; the constructor does that right after
  # the checks have passed.
  def check_options
    options_allowed(@options, :database, :taxonomy, :confidence, :cpus)
    options_required(@options, :database, :taxonomy)
    options_files_exist(@options, :database, :taxonomy)
    options_assert(@options, ':confidence > 0')
    options_assert(@options, ':confidence <= 100')
    options_assert(@options, ':cpus >= 1')
    options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
  end

  # Set default options.
  def defaults
    @options[:confidence] ||= 80
    @options[:cpus]       ||= 1
  end

  # Process input data and save sequences to a temporary file for
  # classification.
  #
  # Every record is emitted to the output unchanged. Records with a :SEQ
  # value are additionally written as FASTA entries, named by :SEQ_NAME or,
  # if that is missing, by the record index.
  #
  # @param input  [Enumerator]          Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  # @param tmp_in [String]              Path to temporary FASTA file.
  def process_input(input, output, tmp_in)
    BioDSL::Fasta.open(tmp_in, 'w') do |ios|
      input.each_with_index do |record, i|
        @status[:records_in] += 1

        if record[:SEQ]
          seq_length = record[:SEQ].length

          @status[:sequences_in]  += 1
          @status[:sequences_out] += 1
          @status[:residues_in]   += seq_length
          # Residues are counted here; records_out is only bumped once per
          # emitted record below.
          @status[:residues_out]  += seq_length

          seq_name = record[:SEQ_NAME] || i.to_s
          entry    = BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])

          ios.puts entry.to_fasta
        end

        output << record
        @status[:records_out] += 1
      end
    end
  end

  # Run Mothur using a system call.
  #
  # The command is assembled from a heredoc whose lines are joined into a
  # single mothur batch string. Output is discarded unless BioDSL.verbose is
  # set.
  #
  # NOTE(review): paths are interpolated into the shell command unescaped,
  # so paths containing spaces or shell metacharacters will likely break
  # the call.
  #
  # @param tmp_dir [String] Path to temporary dir.
  # @param tmp_in  [String] Path to input file.
  #
  # @raise [RuntimeError] If system call fails.
  def run_mothur(tmp_dir, tmp_in)
    cmd = <<-CMD.gsub(/^\s+\|/, '').delete("\n")
      |mothur "#set.dir(input=#{tmp_dir});
      |set.dir(output=#{tmp_dir});
      |classify.seqs(fasta=#{tmp_in},
      |reference=#{@options[:database]},
      |taxonomy=#{@options[:taxonomy]},
      |method=wang,
      |processors=#{@options[:cpus]})"
    CMD

    BioDSL.verbose ? system(cmd) : system("#{cmd} > /dev/null 2>&1")

    fail 'Mothur failed' unless $CHILD_STATUS.success?
  end

  # Parse mothur classification output and emit to stream.
  #
  # The first two columns (:V0 and :V1) are renamed to :SEQ_NAME and
  # :TAXONOMY, double quotes are stripped from the taxonomy string and the
  # taxonomy is reduced by the confidence filter.
  #
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_out [String] Path to file with classification result.
  def process_output(output, tmp_out)
    BioDSL::CSV.open(tmp_out) do |ios|
      ios.each_hash do |new_record|
        new_record[:SEQ_NAME] = new_record[:V0]
        new_record[:TAXONOMY] = new_record[:V1]
        new_record[:TAXONOMY].tr!('"', '')
        new_record.delete(:V0)
        new_record.delete(:V1)
        new_record[:TAXONOMY]    = confidence_filter(new_record)
        new_record[:RECORD_TYPE] = 'taxonomy'

        output << new_record
        @status[:records_out] += 1
      end
    end
  end

  # Filter taxonomic levels based on the confidence.
  #
  # Each semicolon separated level is expected in the form "Name(NN)".
  # Levels in any other form, or with a confidence below the :confidence
  # option, are dropped.
  #
  # @param record [Hash] BioDSL record with taxonomy.
  #
  # @return [String] Return taxonomic string, or 'Unclassified' if no level
  #   passed the filter.
  def confidence_filter(record)
    new_levels = []

    record[:TAXONOMY].split(';').each do |level|
      next unless level =~ /^([^(]+)\((\d+)\)$/

      name       = Regexp.last_match(1)
      confidence = Regexp.last_match(2).to_i

      next if confidence < @options[:confidence]

      new_levels << "#{name}(#{confidence})"
    end

    new_levels.empty? ? 'Unclassified' : new_levels.join(';')
  end
end
226
+ end