BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,286 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Run uclust on sequences in the stream.
30
+ #
31
+ # This is a wrapper for the +usearch+ tool to run the program uclust.
32
+ # Basically sequence type records are clustered de-novo and records containing
33
+ # sequence and cluster information are output. If the +align+ option is given
34
+ # the sequences will be aligned.
35
+ #
36
+ # Please refer to the manual:
37
+ #
38
+ # http://www.drive5.com/usearch/manual/cmd_cluster_smallmem.html
39
+ #
40
+ # Usearch 7.0 must be installed for +usearch+ to work. Read more here:
41
+ #
42
+ # http://www.drive5.com/usearch/
43
+ #
44
+ # == Usage
45
+ #
46
+ # uclust(<identity: float>, <strand: "plus|both">[, align: <bool>
47
+ # [, cpus: <uint>]])
48
+ #
49
+ # === Options
50
+ #
51
+ # * identity: <float> - Similarity for matching as a fraction between 0.0 and
52
+ # 1.0.
53
+ # * strand: <string> - For nucleotide search report hits from plus or both
54
+ # strands.
55
+ # * align: <bool> - Align sequences.
56
+ # * cpus: <uint> - Number of CPU cores to use (default=1).
57
+ #
58
+ # == Examples
59
+ #
60
+ # rubocop: disable ClassLength
61
+ class Uclust
62
+ require 'BioDSL/helpers/aux_helper'
63
+
64
+ include AuxHelper
65
+
66
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
67
+ residues_out clusters_out)
68
+
69
+ # Constructor for Uclust.
70
+ #
71
+ # @param options [Hash] Options hash.
72
+ # @option options [Float] :identity
73
+ # @option options [String,Symbol] :strand
74
+ # @option options [Boolean] :align
75
+ # @option options [Integer] :cpus
76
+ #
77
+ # @return [Uclust] Class instance.
78
+ def initialize(options)
79
+ @options = options
80
+ @options[:cpus] ||= 1
81
+
82
+ aux_exist('usearch')
83
+ check_options
84
+ end
85
+
86
+ # Return command lambda for uclust.
87
+ #
88
+ # @return [Proc] Command lambda.
89
+ def lmb
90
+ lambda do |input, output, status|
91
+ status_init(status, STATS)
92
+
93
+ TmpDir.create('rec', 'in', 'out') do |tmp_rec, tmp_in, tmp_out|
94
+ process_input(input, output, tmp_rec, tmp_in)
95
+
96
+ run_uclust(tmp_in, tmp_out)
97
+
98
+ if @options[:align]
99
+ process_output_align(output, tmp_out)
100
+ else
101
+ process_output(output, tmp_rec, tmp_out)
102
+ end
103
+ end
104
+ end
105
+ end
106
+
107
+ private
108
+
109
+ # Check options.
110
+ def check_options
111
+ options_allowed(@options, :identity, :strand, :align, :cpus)
112
+ options_required(@options, :identity, :strand)
113
+ options_allowed_values(@options, strand: ['plus', 'both', :plus, :both])
114
+ options_allowed_values(@options, align: [nil, false, true])
115
+ options_assert(@options, ':identity > 0.0')
116
+ options_assert(@options, ':identity <= 1.0')
117
+ options_assert(@options, ':cpus >= 1')
118
+ options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
119
+ end
120
+
121
+ # Process input data and serialize all records into a temporary file and all
122
+ # records containing sequence to a temporary FASTA file.
123
+ #
124
+ # @param input [Enumerator] Input stream
125
+ # @param output [Enumerator::Yielder] Output stream.
126
+ # @param tmp_rec [String] Path to serialized records file.
127
+ # @param tmp_in [String] Path to input file.
128
+ def process_input(input, output, tmp_rec, tmp_in)
129
+ File.open(tmp_rec, 'wb') do |ios_rec|
130
+ BioDSL::Serializer.new(ios_rec) do |s|
131
+ BioDSL::Fasta.open(tmp_in, 'w') do |ios|
132
+ process_input_records(input, output, ios, s)
133
+ end
134
+ end
135
+ end
136
+ end
137
+
138
+ # Iterate over records in the input stream and serialize all records. Also,
139
+ # records with sequence are saved in a FASTA file, while records without
140
+ # sequence are emitted directly to the output stream.
141
+ #
142
+ # @param input [Enumerator] Input stream
143
+ # @param output [Enumerator::Yielder] Output stream.
144
+ # @param ios [Fasta::IO] Output stream to a FASTA file
145
+ # @param serializer [BioDSL::Serializer] Serializer IO.
146
+ def process_input_records(input, output, ios, serializer)
147
+ input.each_with_index do |record, i|
148
+ @status[:records_in] += 1
149
+
150
+ if record[:SEQ]
151
+ output_entry(ios, record, i)
152
+ else
153
+ @status[:records_out] += 1
154
+ output << record
155
+ end
156
+
157
+ serializer << record
158
+ end
159
+ end
160
+
161
+ # Save a BioDSL record to a FASTA file.
162
+ #
163
+ # @param ios [Fasta::IO] Output stream to a FASTA file
164
+ # @param record [Hash] BioDSL record.
165
+ # @param i [Integer] Record index.
166
+ def output_entry(ios, record, i)
167
+ @status[:sequences_in] += 1
168
+
169
+ record[:SEQ_NAME] ||= i.to_s
170
+
171
+ entry = BioDSL::Seq.new(seq_name: record[:SEQ_NAME], seq: record[:SEQ])
172
+
173
+ @status[:residues_in] += entry.length
174
+
175
+ ios.puts entry.to_fasta
176
+ end
177
+
178
+ # Run the uclust command.
179
+ #
180
+ # @param tmp_in [String] Path to input file.
181
+ # @param tmp_out [String] Path to output file.
182
+ #
183
+ # @raise [BioDSL::UsearchError] if command fails.
184
+ def run_uclust(tmp_in, tmp_out)
185
+ uclust_opts = {
186
+ input: tmp_in,
187
+ output: tmp_out,
188
+ strand: @options[:strand],
189
+ identity: @options[:identity],
190
+ align: @options[:align],
191
+ cpus: @options[:cpus],
192
+ verbose: @options[:verbose]
193
+ }
194
+
195
+ BioDSL::Usearch.cluster_smallmem(uclust_opts)
196
+ rescue BioDSL::UsearchError => e
197
+ raise unless e.message =~ /Empty input file/
198
+ end
199
+
200
+ # Parse uclust output file and return a hash with Q_ID as key and the uclust
201
+ # record as value.
202
+ #
203
+ # @param tmp_out [String] Path to output file.
204
+ #
205
+ # @return [Hash] Q_ID as keys and Uclust records.
206
+ def parse_output(tmp_out)
207
+ results = {}
208
+
209
+ BioDSL::Usearch.open(tmp_out) do |ios|
210
+ ios.each(:uc) do |record|
211
+ record[:RECORD_TYPE] = 'uclust'
212
+
213
+ results[record[:Q_ID]] = record
214
+ end
215
+ end
216
+
217
+ results
218
+ end
219
+
220
+ # Parse MSA alignment data from uclust output file and emit to the output
221
+ # stream.
222
+ #
223
+ # @param output [Enumerator::Yielder] Output stream.
224
+ # @param tmp_out [String] Path to uclust output file.
225
+ def process_output_align(output, tmp_out)
226
+ BioDSL::Fasta.open(tmp_out) do |ios|
227
+ ios.each do |entry|
228
+ if entry.seq_name == 'consensus'
229
+ @status[:clusters_out] += 1
230
+ else
231
+ record = {RECORD_TYPE: 'uclust', CLUSTER: @status[:clusters_out]}
232
+ record.merge!(entry.to_bp)
233
+
234
+ output << record
235
+ @status[:records_out] += 1
236
+ @status[:sequences_out] += 1
237
+ @status[:residues_out] += entry.length
238
+ end
239
+ end
240
+ end
241
+ end
242
+
243
+ # Parse results from uclust and merge with serialized data and output to the
244
+ # output stream.
245
+ #
246
+ # @param output [Enumerator::Yielder] Output stream.
247
+ # @param tmp_rec [String] Path to serialized records file.
248
+ # @param tmp_out [String] Path to uclust output file.
249
+ def process_output(output, tmp_rec, tmp_out)
250
+ results = parse_output(tmp_out)
251
+
252
+ File.open(tmp_rec, 'rb') do |ios_rec|
253
+ BioDSL::Serializer.new(ios_rec) do |s|
254
+ process_output_serial(s, results, output)
255
+ end
256
+ end
257
+ end
258
+
259
+ # Deserialize records from temporary file, merge these with cluster data and
260
+ # emit to the output stream.
261
+ #
262
+ # @param serializer [BioDSL::Serializer]
263
+ # Serializer IO.
264
+ #
265
+ # @param results [Hash]
266
+ # Results from uclust with Q_ID as key and uclust record as value
267
+ #
268
+ # @param output [Enumerator::Yielder]
269
+ # Output stream.
270
+ def process_output_serial(serializer, results, output)
271
+ serializer.each do |record|
272
+ next unless record[:SEQ_NAME]
273
+
274
+ if (r = results[record[:SEQ_NAME]])
275
+ output << record.merge(r)
276
+ @status[:records_out] += 1
277
+ @status[:sequences_out] += 1
278
+ @status[:residues_out] += record[:SEQ].length
279
+ else
280
+ fail BioDSL::UsearchError, 'Sequence name: ' \
281
+ "#{record[:SEQ_NAME]} not found in uclust results"
282
+ end
283
+ end
284
+ end
285
+ end
286
+ end
@@ -0,0 +1,145 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Select unique or non-unique records based on the value of a given key.
30
+ #
31
+ # +unique_values+ selects records from the stream by checking values of a
32
+ # given key. If a duplicate record exists based on the given key, it will only
33
+ # output one record (the first). If the +invert+ option is used, then
34
+ # non-unique records are selected.
35
+ #
36
+ # == Usage
37
+ #
38
+ # unique_values(<key: string>[, invert: <bool>])
39
+ #
40
+ # === Options
41
+ #
42
+ # * key: <string> - Key for which the value is checked for uniqueness.
43
+ # * invert: <bool> - Select non-unique records (default=false).
44
+ #
45
+ # == Examples
46
+ #
47
+ # Consider the following two column table in the file `test.tab`:
48
+ #
49
+ # Human H1
50
+ # Human H2
51
+ # Human H3
52
+ # Dog D1
53
+ # Dog D2
54
+ # Mouse M1
55
+ #
56
+ # To output only unique values for the first column we first read the table
57
+ # with +read_table+ and then pass the result to +unique_values+:
58
+ #
59
+ # BP.new.read_table(input: "test.tab").unique_values(key: :V0).dump.run
60
+ #
61
+ # {:V0=>"Human", :V1=>"H1"}
62
+ # {:V0=>"Dog", :V1=>"D1"}
63
+ # {:V0=>"Mouse", :V1=>"M1"}
64
+ #
65
+ # To output duplicate records instead use the +invert+ option:
66
+ #
67
+ # BP.new.
68
+ # read_table(input: "test.tab").
69
+ # unique_values(key: :V0, invert: true).
70
+ # dump.
71
+ # run
72
+ #
73
+ # {:V0=>"Human", :V1=>"H2"}
74
+ # {:V0=>"Human", :V1=>"H3"}
75
+ # {:V0=>"Dog", :V1=>"D2"}
76
+ class UniqueValues
77
+ require 'set'
78
+
79
+ STATS = %i(records_in records_out)
80
+
81
+ # Constructor for UniqueValues.
82
+ #
83
+ # @param options [Hash] Options hash.
84
+ # @option options [String,Symbol] :key
85
+ # @option options [Boolean] :invert
86
+ #
87
+ # @return [UniqueValues] Class instance.
88
+ def initialize(options)
89
+ @options = options
90
+ @lookup = Set.new
91
+ @key = options[:key].to_sym
92
+ @invert = options[:invert]
93
+
94
+ check_options
95
+ end
96
+
97
+ # Return command lambda for unique_values
98
+ #
99
+ # @return [Proc] Command lambda.
100
+ def lmb
101
+ lambda do |input, output, status|
102
+ status_init(status, STATS)
103
+
104
+ input.each do |record|
105
+ @status[:records_in] += 1
106
+
107
+ if output_record?(record)
108
+ output << record
109
+ @status[:records_out] += 1
110
+ end
111
+ end
112
+ end
113
+ end
114
+
115
+ private
116
+
117
+ # Check options.
118
+ def check_options
119
+ options_allowed(@options, :key, :invert)
120
+ options_required(@options, :key)
121
+ options_allowed_values(@options, invert: [true, false, nil])
122
+ end
123
+
124
+ # rubocop: disable Metrics/CyclomaticComplexity
125
+
126
+ # Determine if a record should be output or not. If the wanted key is not
127
+ # present in the record it will be output. If the value is unique the record
128
+ # will be output, unless the +invert+ option was used which will result in
129
+ # non-unique records being output.
130
+ #
131
+ # @param record [Hash] BioDSL record.
132
+ #
133
+ # @return [Boolean]
134
+ def output_record?(record)
135
+ return true unless (value = record[@key])
136
+
137
+ value = value.to_sym if value.is_a? String
138
+ found = @lookup.include?(value)
139
+
140
+ @lookup.add(value) unless found
141
+
142
+ found && @invert || !found && !@invert
143
+ end
144
+ end
145
+ end
@@ -0,0 +1,171 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Run usearch_global on sequences in the stream.
30
+ #
31
+ # This is a wrapper for the +usearch+ tool to run the program usearch_global.
32
+ # Basically sequence type records are searched against a reference database
33
+ # and records with hit information are output.
34
+ #
35
+ # Please refer to the manual:
36
+ #
37
+ # http://drive5.com/usearch/manual/usearch_global.html
38
+ #
39
+ # Usearch 7.0 must be installed for +usearch+ to work. Read more here:
40
+ #
41
+ # http://www.drive5.com/usearch/
42
+ #
43
+ # == Usage
44
+ #
45
+ # usearch_global(<database: file>, <identity: float>,
46
+ # <strand: "plus|both">[, cpus: <uint>])
47
+ #
48
+ # === Options
49
+ #
50
+ # * database: <file> - Database to search (in FASTA format).
51
+ # * identity: <float> - Similarity for matching as a fraction between 0.0 and
52
+ # 1.0.
53
+ # * strand: <string> - For nucleotide search report hits from plus or both
54
+ # strands.
55
+ # * cpus: <uint> - Number of CPU cores to use (default=1).
56
+ #
57
+ # == Examples
58
+ #
59
+ class UsearchGlobal
60
+ require 'BioDSL/helpers/aux_helper'
61
+
62
+ include AuxHelper
63
+
64
+ STATS = %i(records_in records_out sequences_in hits_out)
65
+
66
+ # Constructor for UsearchGlobal.
67
+ #
68
+ # @param options [Hash] Options hash.
69
+ # @option options [String] :database
70
+ # @option options [Float] :identity
71
+ # @option options [String,Symbol] :strand
72
+ # @option options [Integer] :cpus
73
+ #
74
+ # @return [UsearchGlobal] Class instance.
75
+ def initialize(options)
76
+ @options = options
77
+ @options[:cpus] ||= 1
78
+
79
+ aux_exist('usearch')
80
+ check_options
81
+ end
82
+
83
+ # Return command lambda for usearch_global.
84
+ #
85
+ # @return [Proc] Command lambda.
86
+ def lmb
87
+ lambda do |input, output, status|
88
+ status_init(status, STATS)
89
+
90
+ TmpDir.create('in', 'out') do |tmp_in, tmp_out|
91
+ process_input(input, output, tmp_in)
92
+ run_usearch_global(tmp_in, tmp_out)
93
+ process_output(output, tmp_out)
94
+ end
95
+ end
96
+ end
97
+
98
+ private
99
+
100
+ # Check options.
101
+ def check_options
102
+ options_allowed(@options, :database, :identity, :strand, :cpus)
103
+ options_required(@options, :database, :identity)
104
+ options_allowed_values(@options, strand: ['plus', 'both', :plus, :both])
105
+ options_files_exist(@options, :database)
106
+ options_assert(@options, ':identity > 0.0')
107
+ options_assert(@options, ':identity <= 1.0')
108
+ options_assert(@options, ':cpus >= 1')
109
+ options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
110
+ end
111
+
112
+ # Process input and emit to the output stream while saving all records
113
+ # containing sequences to a temporary FASTA file.
114
+ #
115
+ # @param input [Enumerator] Input stream.
116
+ # @param output [Enumerator::Yielder] Output stream.
117
+ # @param tmp_in [String] Path to temporary file.
118
+ def process_input(input, output, tmp_in)
119
+ BioDSL::Fasta.open(tmp_in, 'w') do |ios|
120
+ input.each_with_index do |record, i|
121
+ @status[:records_in] += 1
122
+
123
+ output << record
124
+
125
+ @status[:records_out] += 1
126
+
127
+ next unless record[:SEQ]
128
+
129
+ @status[:sequences_in] += 1
130
+ seq_name = record[:SEQ_NAME] || i.to_s
131
+
132
+ entry = BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])
133
+
134
+ ios.puts entry.to_fasta
135
+ end
136
+ end
137
+ end
138
+
139
+ # Run usearch global on the input file and save results in the output file.
140
+ def run_usearch_global(tmp_in, tmp_out)
141
+ run_opts = {
142
+ input: tmp_in,
143
+ output: tmp_out,
144
+ database: @options[:database],
145
+ strand: @options[:strand],
146
+ identity: @options[:identity],
147
+ cpus: @options[:cpus],
148
+ verbose: @options[:verbose]
149
+ }
150
+
151
+ BioDSL::Usearch.usearch_global(run_opts)
152
+ rescue BioDSL::UsearchError => e
153
+ raise unless e.message =~ /Empty input file/
154
+ end
155
+
156
+ # Parse usearch output file and emit records to the output stream.
157
+ #
158
+ # @param output [Enumerator::Yielder] Output stream.
159
+ # @param tmp_out [String] Path to output file.
160
+ def process_output(output, tmp_out)
161
+ BioDSL::Usearch.open(tmp_out) do |ios|
162
+ ios.each(:uc) do |record|
163
+ record[:RECORD_TYPE] = 'usearch'
164
+ output << record
165
+ @status[:hits_out] += 1
166
+ @status[:records_out] += 1
167
+ end
168
+ end
169
+ end
170
+ end
171
+ end