BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,253 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
module BioDSL
  # == Remove gaps from sequences or gap only columns in alignments.
  #
  # +degap_seq+ removes gaps from sequences (the letters ~-_.). If the option
  # +columns_only+ is used then gaps from aligned sequences will be removed if,
  # and only if, the entire column consists of gaps.
  #
  # == Usage
  #
  #    degap_seq([columns_only: <bool>])
  #
  # === Options
  #
  # * columns_only: <bool> - Remove gap columns only (default=false).
  #
  # == Examples
  #
  # Consider the following FASTA entries in the file `test.fna`:
  #
  #    >test1
  #    A-G~T.C_
  #    >test2
  #    AGG_T-C~
  #
  # To remove all gaps from all sequences do:
  #
  #    BP.new.read_fasta(input: "test.fna").degap_seq.dump.run
  #
  #    {:SEQ_NAME=>"test1", :SEQ=>"AGTC", :SEQ_LEN=>4}
  #    {:SEQ_NAME=>"test2", :SEQ=>"AGGTC", :SEQ_LEN=>5}
  #
  #
  # To remove all gap-only columns use the +columns_only+ option:
  #
  #    BP.new.
  #    read_fasta(input: "test.fna").
  #    degap_seq(columns_only: true).
  #    dump.
  #    run
  #
  #    {:SEQ_NAME=>"test1", :SEQ=>"A-GTC", :SEQ_LEN=>5}
  #    {:SEQ_NAME=>"test2", :SEQ=>"AGGTC", :SEQ_LEN=>5}
  #
  # rubocop:disable ClassLength
  class DegapSeq
    require 'narray'

    # Status counters handed to +status_init+ when the command lambda runs.
    STATS = %i(records_in records_out sequences_in sequences_out residues_in
               residues_out).freeze

    # Constructor for DegapSeq.
    #
    # @param options [Hash] Options Hash.
    #
    # @option options [Boolean] :columns_only
    #   Flag indicating that only gap-only columns should be removed.
    #
    # @return [DegapSeq] Instance of DegapSeq.
    def initialize(options)
      @options = options
      @indels  = BioDSL::Seq::INDELS.sort.join('')
      @na_mask = nil # Per-column gap counts, later the boolean keep-mask.
      @max_len = nil # Length of the first sequence seen.
      @count   = 0   # Number of sequences added to the mask.

      check_options
    end

    # Return the command lambda for DegapSeq.
    #
    # In +columns_only+ mode the number of removed columns is stored in
    # +status[:columns_removed]+.
    #
    # @return [Proc] Command lambda.
    def lmb
      lambda do |input, output, status|
        # status_init is defined outside this class; the methods below rely on
        # @status being available afterwards (presumably set by that helper).
        status_init(status, STATS)

        if @options[:columns_only]
          degap_columns(input, output)
          # The mask is never built when the stream held no sequence records,
          # in which case no columns were removed.
          status[:columns_removed] = @na_mask ? @na_mask.count_false : 0
        else
          degap_all(input, output)
        end
      end
    end

    private

    # Check options.
    def check_options
      options_allowed(@options, :columns_only)
      options_allowed_values(@options, columns_only: [true, false, nil])
    end

    # Remove all gap-only columns from all sequences in input stream and output
    # to output stream.
    #
    # All records are first serialized to a temporary file while the gap mask
    # is accumulated; they are then read back and emitted with the mask applied.
    #
    # @param input  [Enumerator]          Input stream.
    # @param output [Enumerator::Yielder] Output stream.
    def degap_columns(input, output)
      TmpDir.create('degap_seq') do |tmp_file, _|
        process_input(input, tmp_file)
        create_mask
        process_output(output, tmp_file)
      end
    end

    # Serialize all input records to a temporary file and at the same time add
    # all sequence type records to the gap mask.
    #
    # @param input    [Enumerator] Input stream.
    # @param tmp_file [String]     Path to temporary file.
    def process_input(input, tmp_file)
      File.open(tmp_file, 'wb') do |ios|
        BioDSL::Serializer.new(ios) do |s|
          input.each do |record|
            @status[:records_in] += 1

            if (seq = record[:SEQ])
              mask_add(seq)
              @count += 1
            end

            s << record
          end
        end
      end
    end

    # Add sequence gaps to mask.
    #
    # For every column the mask holds the number of sequences seen so far that
    # have a gap character at that position.
    #
    # @param seq [String] Sequence.
    #
    # @raise [BioDSL::SeqError] if the sequence length differs from the first.
    def mask_add(seq)
      @status[:sequences_in] += 1
      @status[:residues_in]  += seq.length

      @max_len ||= seq.length

      check_length(seq)

      @na_mask ||= NArray.int(seq.length)
      na_seq = NArray.to_na(seq, 'byte')
      @indels.each_char { |c| @na_mask += na_seq.eq(c.ord) }
    end

    # Check if sequence length match max_len.
    #
    # @param seq [String] Sequence.
    #
    # @raise [BioDSL::SeqError] if sequence length and max_len don't match.
    def check_length(seq)
      return if @max_len == seq.length

      raise BioDSL::SeqError,
            "Uneven seq lengths: #{@max_len} != #{seq.length}"
    end

    # Create a mask for all-gap columns.
    #
    # Turns the per-column gap counts into a keep-mask: a column is kept when
    # its gap count differs from the number of sequences, i.e. when at least
    # one sequence has a residue there. Does nothing when no sequence was seen.
    def create_mask
      return unless @na_mask

      @na_mask = @na_mask.ne(@count)
    end

    # Read all serialized records from the temporary file and emit to the output
    # stream records with degapped sequences.
    #
    # @param output   [Enumerator::Yielder] Output stream.
    # @param tmp_file [String]              Path to temporary file.
    def process_output(output, tmp_file)
      File.open(tmp_file, 'rb') do |ios|
        BioDSL::Serializer.new(ios) do |s|
          s.each do |record|
            remove_residues(record) if record[:SEQ]

            output << record
            @status[:records_out] += 1
          end
        end
      end
    end

    # Given a BioDSL record containing sequence information
    # remove all residues based on the na_mask.
    #
    # Updates +:SEQ+ and +:SEQ_LEN+ of the record in place.
    #
    # @param record [Hash] BioDSL record.
    def remove_residues(record)
      na_seq           = NArray.to_na(record[:SEQ], 'byte')
      record[:SEQ]     = na_seq[@na_mask].to_s
      record[:SEQ_LEN] = record[:SEQ].length

      @status[:sequences_out] += 1
      @status[:residues_out]  += record[:SEQ].length
    end

    # Remove all gaps from all sequences in input stream and output to output
    # stream.
    #
    # @param input  [Enumerator]          Input stream.
    # @param output [Enumerator::Yielder] Output stream.
    def degap_all(input, output)
      input.each do |record|
        @status[:records_in] += 1

        degap_seq(record) if record.key? :SEQ

        output << record

        @status[:records_out] += 1
      end
    end

    # Given a BioDSL record with sequence information, remove all gaps from
    # the sequence.
    #
    # The record is updated in place by merging the degapped entry back in.
    #
    # @param record [Hash] BioDSL record.
    def degap_seq(record)
      entry = BioDSL::Seq.new_bp(record)

      @status[:sequences_in] += 1
      @status[:residues_in]  += entry.length

      entry.seq.delete!(@indels)

      @status[:sequences_out] += 1
      @status[:residues_out]  += entry.length

      record.merge! entry.to_bp
    end
  end
end
@@ -0,0 +1,168 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
module BioDSL
  # == Dereplicate sequences in the stream.
  #
  # +dereplicate_seq+ removes all duplicate sequence records. Dereplicated
  # sequences are output along with the count of replicates. Using the
  # +ignore_case+ option disables the default case sensitive sequence matching.
  #
  # == Usage
  #
  #    dereplicate_seq([ignore_case: <bool>])
  #
  # === Options
  #
  # * ignore_case: <bool> - Ignore sequence case.
  #
  # == Examples
  #
  # Consider the following FASTA file test.fna:
  #
  #    >test1
  #    ATGC
  #    >test2
  #    ATGC
  #    >test3
  #    GCAT
  #
  # To dereplicate all sequences we use +read_fasta+ and +dereplicate_seq+:
  #
  #    BP.new.read_fasta(input: "test.fna").dereplicate_seq.dump.run
  #
  #    {:SEQ_NAME=>"test1", :SEQ=>"ATGC", :SEQ_LEN=>4, :SEQ_COUNT=>2}
  #    {:SEQ_NAME=>"test3", :SEQ=>"GCAT", :SEQ_LEN=>4, :SEQ_COUNT=>1}
  class DereplicateSeq
    # Status counters handed to +status_init+ when the command lambda runs.
    STATS = %i(records_in records_out sequences_in sequences_out residues_in
               residues_out).freeze

    # Constructor for the DereplicateSeq class.
    #
    # @param options [Hash] Options hash.
    # @option options [Boolean] :ignore_case Ignore sequence case.
    #
    # @return [DereplicateSeq] Class instance.
    def initialize(options)
      @options = options
      @lookup  = {} # Sequence (String) => number of times it was seen.

      check_options
    end

    # Return the command lambda for DereplicateSeq.
    #
    # @return [Proc] Command lambda.
    def lmb
      lambda do |input, output, status|
        # status_init is defined outside this class; the methods below rely on
        # @status being available afterwards (presumably set by that helper).
        status_init(status, STATS)

        TmpDir.create('dereplicate_seq') do |tmp_file, _|
          process_input(input, output, tmp_file)
          process_output(output, tmp_file)
        end
      end
    end

    private

    # Check options.
    def check_options
      options_allowed(@options, :ignore_case)
      options_allowed_values(@options, ignore_case: [nil, true, false])
    end

    # Process input stream and serialize all records with sequence information.
    # All other records are emitted to the output stream.
    #
    # @param input    [Enumerator]          Input stream.
    # @param output   [Enumerator::Yielder] Output stream.
    # @param tmp_file [String]              Path to temporary file.
    def process_input(input, output, tmp_file)
      File.open(tmp_file, 'wb') do |ios|
        BioDSL::Serializer.new(ios) do |s|
          input.each do |record|
            @status[:records_in] += 1

            if record.key? :SEQ
              serialize(record, s)
            else
              output << record

              @status[:records_out] += 1
            end
          end
        end
      end
    end

    # Serialize records with unique sequences and keep a count of how many
    # times each sequence was encountered.
    #
    # Only the first record carrying a given sequence is written to the
    # serializer; later replicates just bump the counter.
    #
    # @param record [Hash]               BioDSL record.
    # @param s      [BioDSL::Serializer] Serializer.
    def serialize(record, s)
      @status[:sequences_in] += 1
      @status[:residues_in]  += record[:SEQ].length

      key = lookup_key(record)

      s << record unless @lookup.key?(key)

      @lookup[key] = @lookup.fetch(key, 0) + 1
    end

    # Read all serialized records from tmp file and emit to the output stream
    # along with the sequence count.
    #
    # @param output   [Enumerator::Yielder] Output stream.
    # @param tmp_file [String]              Path to tmp file.
    def process_output(output, tmp_file)
      File.open(tmp_file, 'rb') do |ios|
        BioDSL::Serializer.new(ios) do |s|
          s.each do |record|
            @status[:residues_out] += record[:SEQ].length
            record[:SEQ_COUNT] = @lookup[lookup_key(record)]

            output << record

            @status[:records_out]   += 1
            @status[:sequences_out] += 1
          end
        end
      end
    end

    # Build the lookup key for a record's sequence.
    #
    # A plain String is used as key rather than a Symbol, so that arbitrary
    # sequence data from the stream is not interned as symbols. The record's
    # own sequence is never modified.
    #
    # @param record [Hash] BioDSL record with a +:SEQ+ value.
    #
    # @return [String] Sequence, downcased if +ignore_case+ is set.
    def lookup_key(record)
      seq = record[:SEQ]

      @options[:ignore_case] ? seq.downcase : seq
    end
  end
end
@@ -0,0 +1,157 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
module BioDSL
  # == Dump records in stream to STDOUT.
  #
  # +dump+ outputs records from the stream to STDOUT.
  #
  # == Usage
  #
  #    dump([first: <uint> |last: <uint>])
  #
  # === Options
  #
  # * first <uint> - Only dump the first number of records.
  # * last <uint>  - Only dump the last number of records.
  #
  # == Examples
  #
  # To dump all records in the stream:
  #
  #    dump
  #
  # To dump only the _first_ 10 records:
  #
  #    dump(first: 10)
  #
  # To dump only the _last_ 10 records:
  #
  #    dump(last: 10)
  class Dump
    # Status counters handed to +status_init+ when the command lambda runs.
    STATS = %i(records_in records_out).freeze

    # Constructor for the Dump class.
    #
    # @param [Hash] options Options hash.
    # @option options [Integer] :first Dump first number of records.
    # @option options [Integer] :last  Dump last number of records.
    #
    # @return [Dump] Returns an instance of the Dump class.
    def initialize(options)
      @options = options

      check_options
    end

    # Return a lambda for the dump command.
    #
    # @return [Proc] Returns the dump command lambda.
    def lmb
      lambda do |input, output, status|
        # status_init is defined outside this class; the methods below rely on
        # @status being available afterwards (presumably set by that helper).
        status_init(status, STATS)

        if @options[:first]
          dump_first(input, output)
        elsif @options[:last]
          dump_last(input, output)
        else
          dump_all(input, output)
        end
      end
    end

    private

    # Check the options: only +:first+ and +:last+ are allowed, they are
    # mutually exclusive, and each must be greater than zero.
    def check_options
      options_allowed(@options, :first, :last)
      options_unique(@options, :first, :last)
      options_assert(@options, ':first > 0')
      options_assert(@options, ':last > 0')
    end

    # Dump the first number of records.
    #
    # Only the records actually taken from the input are counted as read.
    #
    # @param input  [Enumerable]               Input stream.
    # @param output [Enumerator::Yielder, nil] Output stream, if any.
    def dump_first(input, output)
      input.first(@options[:first]).each do |record|
        @status[:records_in] += 1

        emit(record, output)
      end
    end

    # Dump the last number of records.
    #
    # The whole input is consumed while keeping a sliding window holding at
    # most +:last+ records; the window is emitted once the input is exhausted.
    #
    # @param input  [Enumerable]               Input stream.
    # @param output [Enumerator::Yielder, nil] Output stream, if any.
    def dump_last(input, output)
      window = []
      limit  = @options[:last]

      input.each do |record|
        @status[:records_in] += 1

        window << record
        window.shift if window.size > limit
      end

      window.each { |record| emit(record, output) }
    end

    # Dump all records.
    #
    # @param input  [Enumerable]               Input stream.
    # @param output [Enumerator::Yielder, nil] Output stream, if any.
    def dump_all(input, output)
      input.each do |record|
        @status[:records_in] += 1

        emit(record, output)
      end
    end

    # Print a record to STDOUT and, when an output stream is given, forward
    # the record to it and count it as written.
    #
    # @param record [Hash]                     BioDSL record.
    # @param output [Enumerator::Yielder, nil] Output stream, if any.
    def emit(record, output)
      puts record

      return unless output

      output << record
      @status[:records_out] += 1
    end
  end
end