BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,329 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Read tabular data from one or more files.
30
+ #
31
+ # Tabular input can be read with +read_table+ which will read in chosen rows
32
+ # and chosen columns (separated by a given delimiter) from a table in ASCII
33
+ # text format.
34
+ #
35
+ # If no +keys+ option is given and there is a comment line beginning with #
36
+ # the fields here will be used as keys. Subsequent lines beginning with #
37
+ # will be ignored.
38
+ #
39
+ # If a comment line is present beginning with a # the options +select+ and
40
+ # +reject+ can be used to choose which columns to read.
41
+ #
42
+ # == Usage
43
+ # read_table(input: <glob>[, first: <uint>|last: <uint>][, select: <list>
44
+ # |reject: <list>][, keys: <list>][, skip: <uint>]
45
+ # [, delimiter: <string>])
46
+ #
47
+ # === Options
48
+ # * input <glob> - Input file or file glob expression.
49
+ # * first <uint> - Only read in the _first_ number of entries.
50
+ # * last <uint> - Only read in the _last_ number of entries.
51
+ # * select <list> - List of column indexes or header keys to read.
52
+ # * reject <list> - List of column indexes or header keys to skip.
53
+ # * keys <list> - List of key identifiers to use for each column.
54
+ # * skip <uint> - Number of initial lines to skip (default=0).
55
+ # * delimiter <string> - Delimiter to use for separating columns
56
+ # (default="\s+").
57
+ #
58
+ # == Examples
59
+ #
60
+ # To read all entries from a file:
61
+ #
62
+ # read_table(input: "test.tab")
63
+ #
64
+ # To read all entries from a gzipped file:
65
+ #
66
+ # read_table(input: "test.tab.gz")
67
+ #
68
+ # To read in only 10 records from a file:
69
+ #
70
+ # read_table(input: "test.tab", first: 10)
71
+ #
72
+ # To read in the last 10 records from a file:
73
+ #
74
+ # read_table(input: "test.tab", last: 10)
75
+ #
76
+ # To read all entries from multiple files:
77
+ #
78
+ # read_table(input: "test1.tab,test2.tab")
79
+ #
80
+ # To read entries from multiple files using a glob expression:
81
+ #
82
+ # read_table(input: "*.tab")
83
+ #
84
+ # Consider the following table from the file test.tab:
85
+ #
86
+ # #Organism Sequence Count
87
+ # Human ATACGTCAG 23524
88
+ # Dog AGCATGAC 2442
89
+ # Mouse GACTG 234
90
+ # Cat AAATGCA 2342
91
+ #
92
+ # Reading the entire table will result in 4 records, one for each row,
93
+ # where the keys Organism, Sequence and Count are taken from the comment
94
+ # line prefixed with #:
95
+ #
96
+ # BP.new.read_table(input: "test.tab").dump.run
97
+ #
98
+ # {:Organism=>"Human", :Sequence=>"ATACGTCAG", :Count=>23524}
99
+ # {:Organism=>"Dog", :Sequence=>"AGCATGAC", :Count=>2442}
100
+ # {:Organism=>"Mouse", :Sequence=>"GACTG", :Count=>234}
101
+ # {:Organism=>"Cat", :Sequence=>"AAATGCA", :Count=>2342}
102
+ #
103
+ # However, if the first line is skipped using the +skip+ option the keys
104
+ # will default to V0, V1, V2 ... Vn:
105
+ #
106
+ # BP.new.read_table(input: "test.tab", skip: 1).dump.run
107
+ #
108
+ # {:V0=>"Human", :V1=>"ATACGTCAG", :V2=>23524}
109
+ # {:V0=>"Dog", :V1=>"AGCATGAC", :V2=>2442}
110
+ # {:V0=>"Mouse", :V1=>"GACTG", :V2=>234}
111
+ # {:V0=>"Cat", :V1=>"AAATGCA", :V2=>2342}
112
+ #
113
+ # To explicitly name the columns (or the keys) use the +keys+ option:
114
+ #
115
+ # BP.new.
116
+ # read_table(input: "test.tab", skip: 1, keys: [:ORGANISM, :SEQ, :COUNT]).
117
+ # dump.
118
+ # run
119
+ #
120
+ # {:ORGANISM=>"Human", :SEQ=>"ATACGTCAG", :COUNT=>23524}
121
+ # {:ORGANISM=>"Dog", :SEQ=>"AGCATGAC", :COUNT=>2442}
122
+ # {:ORGANISM=>"Mouse", :SEQ=>"GACTG", :COUNT=>234}
123
+ # {:ORGANISM=>"Cat", :SEQ=>"AAATGCA", :COUNT=>2342}
124
+ #
125
+ # It is possible to select a subset of columns to read by using the
126
+ # +select+ option which takes a comma separated list of column numbers
127
+ # (first column is designated 0) or header keys (requires a header) as
128
+ # argument. So to read in only the sequence and the count so that the
129
+ # count comes before the sequence do:
130
+ #
131
+ # BP.new.read_table(input: "test.tab", skip: 1, select: [2, 1]).dump.run
132
+ #
133
+ # {:V0=>23524, :V1=>"ATACGTCAG"}
134
+ # {:V0=>2442, :V1=>"AGCATGAC"}
135
+ # {:V0=>234, :V1=>"GACTG"}
136
+ # {:V0=>2342, :V1=>"AAATGCA"}
137
+ #
138
+ # Alternatively, if a header line was present in the file:
139
+ #
140
+ # #Organism Sequence Count
141
+ #
142
+ # Then the header keys can be used:
143
+ #
144
+ # BP.new.
145
+ # read_table(input: "test.tab", select: [:Count, :Sequence]).
146
+ # dump.
147
+ # run
148
+ #
149
+ # {:Count=>23524, :Sequence=>"ATACGTCAG"}
150
+ # {:Count=>2442, :Sequence=>"AGCATGAC"}
151
+ # {:Count=>234, :Sequence=>"GACTG"}
152
+ # {:Count=>2342, :Sequence=>"AAATGCA"}
153
+ #
154
+ # Likewise, it is possible to reject specified columns from being read
155
+ # using the +reject+ option:
156
+ #
157
+ # BP.new.read_table(input: "test.tab", skip: 1, reject: [2, 1]).dump.run
158
+ #
159
+ # {:V0=>"Human"}
160
+ # {:V0=>"Dog"}
161
+ # {:V0=>"Mouse"}
162
+ # {:V0=>"Cat"}
163
+ #
164
+ # And again, the header keys can be used if a header is present:
165
+ #
166
+ # BP.new.
167
+ # read_table(input: "test.tab", reject: [:Count, :Sequence]).
168
+ # dump.
169
+ # run
170
+ #
171
+ # {:Organism=>"Human"}
172
+ # {:Organism=>"Dog"}
173
+ # {:Organism=>"Mouse"}
174
+ # {:Organism=>"Cat"}
175
+ #
176
+ # rubocop: disable ClassLength
177
class ReadTable
  # Status counters tracked by this command.
  STATS = %i(records_in records_out)

  # Constructor for ReadTable.
  #
  # @param options [Hash] Options hash.
  # @option options [String]  :input     Input file or file glob expression.
  # @option options [Integer] :first     Only read the first N table entries.
  # @option options [Integer] :last      Only read the last N table entries.
  # @option options [Array]   :keys      Key identifiers to use for the columns.
  # @option options [Integer] :skip      Number of initial lines to skip.
  # @option options [String]  :delimiter Column delimiter.
  # @option options [Array]   :select    Column indexes or header keys to read.
  # @option options [Array]   :reject    Column indexes or header keys to skip.
  #
  # @return [ReadTable] Class instance.
  def initialize(options)
    @options = options
    @keys    = options[:keys] ? options[:keys].map(&:to_sym) : nil
    @skip    = options[:skip] || 0
    @buffer  = []

    check_options
  end

  # Return command lambda for ReadTable.
  #
  # The lambda first forwards any records from the input stream and then
  # emits the records read from the table file(s).
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      # NOTE(review): status_init is defined elsewhere; presumably it sets
      # @status and zeroes the STATS counters - confirm.
      status_init(status, STATS)

      process_input(input, output)

      if @options[:first]
        read_first(output)
      elsif @options[:last]
        read_last(output)
      else
        read_all(output)
      end
    end
  end

  private

  # Check options.
  #
  # NOTE(review): the options_* helpers are defined elsewhere; presumably
  # they raise on invalid options - confirm.
  def check_options
    options_allowed(@options, :input, :first, :last, :keys, :skip, :delimiter,
                    :select, :reject)
    options_required(@options, :input)
    options_files_exist(@options, :input)
    options_unique(@options, :first, :last)
    options_unique(@options, :select, :reject)
    options_list_unique(@options, :keys, :select, :reject)
    options_assert(@options, ':first >= 0')
    options_assert(@options, ':last >= 0')
    options_assert(@options, ':skip >= 0')
  end

  # Return a hash with options for CSV#each_hash.
  #
  # @return [Hash] Read table options.
  def read_options
    {delimiter: @options[:delimiter],
     select:    @options[:select],
     reject:    @options[:reject]}
  end

  # Read :first entries from input files and emit to output stream.
  #
  # The limit is tracked with a local counter: @status[:records_out] also
  # counts records forwarded by process_input, so using it would make
  # upstream records eat into the :first quota.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_first(output)
    limit = @options[:first]
    return unless limit > 0 # first: 0 must not emit anything.

    emitted = 0

    options_glob(@options[:input]).each do |file|
      BioDSL::CSV.open(file) do |ios|
        ios.skip(@skip)

        ios.each_hash(read_options) do |record|
          replace_keys(record) if @keys
          output << record
          @status[:records_out] += 1
          emitted += 1
          return if emitted >= limit
        end
      end
    end
  end

  # Read :last entries from input files and emit to output stream.
  #
  # Records are kept in a sliding window of at most :last entries which is
  # flushed to the output stream once all files have been read.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_last(output)
    options_glob(@options[:input]).each do |file|
      BioDSL::CSV.open(file) do |ios|
        ios.skip(@skip)

        ios.each_hash(read_options) do |record|
          replace_keys(record) if @keys
          @buffer << record
          @buffer.shift if @buffer.size > @options[:last]
        end
      end
    end

    output_buffer(output)
  end

  # Read all entries from input files and emit to output stream.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_all(output)
    options_glob(@options[:input]).each do |file|
      BioDSL::CSV.open(file) do |ios|
        ios.skip(@skip)

        ios.each_hash(read_options) do |record|
          replace_keys(record) if @keys
          output << record
          @status[:records_out] += 1
        end
      end
    end
  end

  # Replace the keys of the leading columns of a given record, in place.
  #
  # The record is rebuilt rather than patched with insert-then-delete, so a
  # column whose new key equals its current key keeps its value instead of
  # being deleted. As before, columns that are not renamed come first,
  # followed by the renamed columns in column order.
  #
  # @param record [Hash] BioDSL record.
  #
  # @return [Hash] The modified record.
  def replace_keys(record)
    pairs   = record.to_a
    renamed = pairs.first(@keys.size).each_with_index.map do |(_, value), i|
      [@keys[i], value]
    end

    record.replace((pairs.drop(@keys.size) + renamed).to_h)
  end

  # Output all records in the buffer to the output stream.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def output_buffer(output)
    @buffer.each do |record|
      output << record
      @status[:records_out] += 1
    end
  end

  # Emit all records from the input stream to the output stream.
  #
  # Does nothing when either stream is missing; in particular a nil input
  # stream is skipped instead of raising NoMethodError.
  #
  # @param input  [Enumerator]          Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  def process_input(input, output)
    return unless input && output

    input.each do |record|
      output << record
      @status[:records_in]  += 1
      @status[:records_out] += 1
    end
  end
end
329
+ end
@@ -0,0 +1,113 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Reverse sequences in the stream.
30
+ #
31
+ # +reverse_seq+ reverses sequences in the stream. If a SCORES key is found
32
+ # then the SCORES are also reversed.
33
+ #
34
+ # +reverse_seq+ can be used together with +complement_seq+ to reverse-
35
+ # complement sequences.
36
+ #
37
+ # == Usage
38
+ #
39
+ # reverse_seq()
40
+ #
41
+ # === Options
42
+ #
43
+ # == Examples
44
+ #
45
+ # Consider the following FASTQ entry in the file test.fq:
46
+ #
47
+ # @M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185
48
+ # TTGTAAAACGACGGCCAGTG
49
+ # +
50
+ # >>>>>FFFFD@A?A0AE0FG
51
+ #
52
+ # To reverse the sequence simply do:
53
+ #
54
+ # BP.new.read_fastq(input:"test.fq").reverse_seq.dump.run
55
+ #
56
+ # {:SEQ_NAME=>"M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185",
57
+ # :SEQ=>"GTGACCGGCAGCAAAATGTT",
58
+ # :SEQ_LEN=>20,
59
+ # :SCORES=>"GF0EA0A?A@DFFFF>>>>>"}
60
class ReverseSeq
  # Status counters tracked by this command.
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Create a new ReverseSeq command.
  #
  # @param options [Hash] Options hash.
  #
  # @return [ReverseSeq] Class instance.
  def initialize(options)
    @options = options

    check_options
  end

  # Build the lambda that runs reverse_seq over a record stream.
  #
  # Every record is forwarded to the output stream; records holding a truthy
  # :SEQ value are reversed first.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      # NOTE(review): status_init is defined elsewhere; presumably it sets
      # @status and zeroes the STATS counters - confirm.
      status_init(status, STATS)

      input.each do |rec|
        @status[:records_in] += 1
        reverse(rec) if rec[:SEQ]
        output << rec
        @status[:records_out] += 1
      end
    end
  end

  private

  # Validate the options hash.
  def check_options
    options_allowed(@options, nil)
  end

  # Reverse the sequence of a record in place and update the sequence and
  # residue counters.
  #
  # @param record [Hash] BioDSL record.
  def reverse(record)
    seq = BioDSL::Seq.new_bp(record)
    seq.reverse!

    %i(sequences_in sequences_out).each { |key| @status[key] += 1 }
    %i(residues_in residues_out).each { |key| @status[key] += seq.length }

    record.merge!(seq.to_bp)
  end
end
113
+ end