BioDSL 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,329 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Read tabular data from one or more files.
30
+ #
31
+ # Tabular input can be read with +read_table+ which will read in chosen rows
32
+ # and chosen columns (separated by a given delimiter) from a table in ASCII
33
+ # text format.
34
+ #
35
+ # If no +keys+ option is given and there is a comment line beginning with #
36
+ # the fields here will be used as keys. Subsequent lines beginning with #
37
+ # will be ignored.
38
+ #
39
+ # If a comment line is present beginning with a # the options +select+ and
40
+ # +reject+ can be used to choose which columns to read.
41
+ #
42
+ # == Usage
43
+ # read_table(input: <glob>[, first: <uint>|last: <uint>][, select: <list>
44
+ # |, reject: <list>[, keys: <list>][, skip: <uint>
45
+ # [, delimiter: <string>]]])
46
+ #
47
+ # === Options
48
+ # * input <glob> - Input file or file glob expression.
49
+ # * first <uint> - Only read in the _first_ number of entries.
50
+ # * last <uint> - Only read in the _last_ number of entries.
51
+ # * select <list> - List of column indexes or header keys to read.
52
+ # * reject <list> - List of column indexes or header keys to skip.
53
+ # * keys <list> - List of key identifiers to use for each column.
54
+ # * skip <uint> - Number of initial lines to skip (default=0).
55
+ # * delimiter <string> - Delimiter to use for separating columns
56
+ # (default="\s+").
57
+ #
58
+ # == Examples
59
+ #
60
+ # To read all entries from a file:
61
+ #
62
+ # read_table(input: "test.tab")
63
+ #
64
+ # To read all entries from a gzipped file:
65
+ #
66
+ # read_table(input: "test.tab.gz")
67
+ #
68
+ # To read in only 10 records from a file:
69
+ #
70
+ # read_table(input: "test.tab", first: 10)
71
+ #
72
+ # To read in the last 10 records from a file:
73
+ #
74
+ # read_table(input: "test.tab", last: 10)
75
+ #
76
+ # To read all entries from multiple files:
77
+ #
78
+ # read_table(input: "test1.tab,test2.tab")
79
+ #
80
+ # To read entries from multiple files using a glob expression:
81
+ #
82
+ # read_table(input: "*.tab")
83
+ #
84
+ # Consider the following table from the file test.tab:
85
+ #
86
+ # #Organism Sequence Count
87
+ # Human ATACGTCAG 23524
88
+ # Dog AGCATGAC 2442
89
+ # Mouse GACTG 234
90
+ # Cat AAATGCA 2342
91
+ #
92
+ # Reading the entire table will result in 4 records, one for each row,
93
+ # where the keys Organism, Sequence and Count are taken from the comment
94
+ # line prefixed with #:
95
+ #
96
+ # BP.new.read_table(input: "test.tab").dump.run
97
+ #
98
+ # {:Organism=>"Human", :Sequence=>"ATACGTCAG", :Count=>23524}
99
+ # {:Organism=>"Dog", :Sequence=>"AGCATGAC", :Count=>2442}
100
+ # {:Organism=>"Mouse", :Sequence=>"GACTG", :Count=>234}
101
+ # {:Organism=>"Cat", :Sequence=>"AAATGCA", :Count=>2342}
102
+ #
103
+ # However, if the first line is skipped using the +skip+ option the keys
104
+ # will default to V0, V1, V2 ... Vn:
105
+ #
106
+ # BP.new.read_table(input: "test.tab", skip: 1).dump.run
107
+ #
108
+ # {:V0=>"Human", :V1=>"ATACGTCAG", :V2=>23524}
109
+ # {:V0=>"Dog", :V1=>"AGCATGAC", :V2=>2442}
110
+ # {:V0=>"Mouse", :V1=>"GACTG", :V2=>234}
111
+ # {:V0=>"Cat", :V1=>"AAATGCA", :V2=>2342}
112
+ #
113
+ # To explicitly name the columns (or the keys) use the +keys+ option:
114
+ #
115
+ # BP.new.
116
+ # read_table(input: "test.tab", skip: 1, keys: [:ORGANISM, :SEQ, :COUNT]).
117
+ # dump.
118
+ # run
119
+ #
120
+ # {:ORGANISM=>"Human", :SEQ=>"ATACGTCAG", :COUNT=>23524}
121
+ # {:ORGANISM=>"Dog", :SEQ=>"AGCATGAC", :COUNT=>2442}
122
+ # {:ORGANISM=>"Mouse", :SEQ=>"GACTG", :COUNT=>234}
123
+ # {:ORGANISM=>"Cat", :SEQ=>"AAATGCA", :COUNT=>2342}
124
+ #
125
+ # It is possible to select a subset of columns to read by using the
126
+ # +select+ option which takes a comma separated list of column numbers
127
+ # (first column is designated 0) or header keys as (requires header)
128
+ # argument. So to read in only the sequence and the count so that the
129
+ # count comes before the sequence do:
130
+ #
131
+ # BP.new.read_table(input: "test.tab", skip: 1, select: [2, 1]).dump.run
132
+ #
133
+ # {:V0=>23524, :V1=>"ATACGTCAG"}
134
+ # {:V0=>2442, :V1=>"AGCATGAC"}
135
+ # {:V0=>234, :V1=>"GACTG"}
136
+ # {:V0=>2342, :V1=>"AAATGCA"}
137
+ #
138
+ # Alternatively, if a header line was present in the file:
139
+ #
140
+ # #Organism Sequence Count
141
+ #
142
+ # Then the header keys can be used:
143
+ #
144
+ # BP.new.
145
+ # read_table(input: "test.tab", skip: 1, select: [:Count, :Sequence]).
146
+ # dump.
147
+ # run
148
+ #
149
+ # {:Count=>23524, :Sequence=>"ATACGTCAG"}
150
+ # {:Count=>2442, :Sequence=>"AGCATGAC"}
151
+ # {:Count=>234, :Sequence=>"GACTG"}
152
+ # {:Count=>2342, :Sequence=>"AAATGCA"}
153
+ #
154
+ # Likewise, it is possible to reject specified columns from being read
155
+ # using the +reject+ option:
156
+ #
157
+ # BP.new.read_table(input: "test.tab", skip: 1, reject: [2, 1]).dump.run
158
+ #
159
+ # {:V0=>"Human"}
160
+ # {:V0=>"Dog"}
161
+ # {:V0=>"Mouse"}
162
+ # {:V0=>"Cat"}
163
+ #
164
+ # And again, the header keys can be used if a header is present:
165
+ #
166
+ # BP.new.
167
+ # read_table(input: "test.tab", skip: 1, reject: [:Count, :Sequence]).
168
+ # dump.
169
+ # run
170
+ #
171
+ # {:Organism=>"Human"}
172
+ # {:Organism=>"Dog"}
173
+ # {:Organism=>"Mouse"}
174
+ # {:Organism=>"Cat"}
175
+ #
176
+ # rubocop: disable ClassLength
177
class ReadTable
  STATS = %i(records_in records_out)

  # Constructor for ReadTable.
  #
  # @param options [Hash] Options hash.
  # @option options [String]  :input     Input file or file glob expression.
  # @option options [Integer] :first     Only read the first N table entries.
  # @option options [Integer] :last      Only read the last N table entries.
  # @option options [Array]   :keys      Key identifiers to use per column.
  # @option options [Integer] :skip      Number of initial lines to skip.
  # @option options [String]  :delimiter Column delimiter.
  # @option options [Array]   :select    Columns to read.
  # @option options [Array]   :reject    Columns to skip.
  #
  # @return [ReadTable] Class instance.
  def initialize(options)
    @options = options
    @keys    = options[:keys] ? options[:keys].map(&:to_sym) : nil
    @skip    = options[:skip] || 0
    @buffer  = []

    check_options
  end

  # Return command lambda for ReadTable
  #
  # The lambda first forwards any records from the input stream and then
  # emits the records read from the table files.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      process_input(input, output)

      case
      when @options[:first] then read_first(output)
      when @options[:last]  then read_last(output)
      else                       read_all(output)
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :input, :first, :last, :keys, :skip, :delimiter,
                    :select, :reject)
    options_required(@options, :input)
    options_files_exist(@options, :input)
    options_unique(@options, :first, :last)
    options_unique(@options, :select, :reject)
    options_list_unique(@options, :keys, :select, :reject)
    options_assert(@options, ':first >= 0')
    options_assert(@options, ':last >= 0')
    options_assert(@options, ':skip >= 0')
  end

  # Return a hash with options for CSV#each_hash.
  #
  # @return [Hash] Read table options.
  def read_options
    {delimiter: @options[:delimiter],
     select:    @options[:select],
     reject:    @options[:reject]}
  end

  # Iterate over every record of every input file, skipping the first
  # +@skip+ lines of each file.
  #
  # @yield [record] Gives each parsed record to the block.
  # @yieldparam record [Hash] BioDSL record.
  def each_record
    options_glob(@options[:input]).each do |file|
      BioDSL::CSV.open(file) do |ios|
        ios.skip(@skip)

        ios.each_hash(read_options) { |record| yield record }
      end
    end
  end

  # Emit a single table record to the output stream, renaming its keys when
  # the :keys option was given, and update the records_out counter.
  #
  # @param output [Enumerator::Yielder] Output stream.
  # @param record [Hash] BioDSL record.
  def emit(output, record)
    replace_keys(record) if @keys
    output << record
    @status[:records_out] += 1
  end

  # Read :first entries from input files and emit to output stream.
  #
  # Table records are counted separately from @status[:records_out], since
  # that counter also includes records forwarded from the input stream.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_first(output)
    limit = @options[:first]
    return unless limit > 0 # first: 0 means no table records at all.

    count = 0

    each_record do |record|
      emit(output, record)
      count += 1
      return if count >= limit
    end
  end

  # Read :last entries from input files and emit to output stream.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_last(output)
    limit   = @options[:last]
    @buffer = [] # Start clean so repeated calls never see stale records.

    each_record do |record|
      @buffer << record
      @buffer.shift if @buffer.size > limit
    end

    output_buffer(output)
  end

  # Read all entries from input files and emit to output stream.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def read_all(output)
    each_record { |record| emit(output, record) }
  end

  # Replace the keys of a given record with the keys from the :keys option.
  #
  # The n'th key of the record is replaced by the n'th entry in +@keys+;
  # columns beyond +@keys.size+ keep their key. The record is rebuilt in
  # place from a snapshot, so column order is preserved and a new key that
  # equals an existing key (e.g. swapped keys) cannot clobber a value.
  #
  # @param record [Hash] BioDSL record.
  #
  # @return [Hash] The same record, with keys replaced.
  def replace_keys(record)
    pairs = record.to_a
    record.clear

    pairs.each_with_index do |(key, value), i|
      record[@keys[i] || key] = value
    end

    record
  end

  # Output all records in the buffer to the output stream.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def output_buffer(output)
    @buffer.each { |record| emit(output, record) }
  end

  # Emit all records from the input stream to the output stream.
  #
  # Does nothing when either stream is missing.
  #
  # @param input  [Enumerator] Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  def process_input(input, output)
    return unless input && output

    input.each do |record|
      output << record
      @status[:records_in]  += 1
      @status[:records_out] += 1
    end
  end
end
329
+ end
@@ -0,0 +1,113 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Reverse sequences in the stream.
30
+ #
31
+ # +reverse_seq+ reverses sequences in the stream. If a SCORES key is found
32
+ # then the SCORES are also reversed.
33
+ #
34
+ # +reverse_seq+ can be used together with +complement_seq+ to reverse-
35
+ # complement sequences.
36
+ #
37
+ # == Usage
38
+ #
39
+ # reverse_seq()
40
+ #
41
+ # === Options
42
+ #
43
+ # == Examples
44
+ #
45
+ # Consider the following FASTQ entry in the file test.fq:
46
+ #
47
+ # @M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185
48
+ # TTGTAAAACGACGGCCAGTG
49
+ # +
50
+ # >>>>>FFFFD@A?A0AE0FG
51
+ #
52
+ # To reverse the sequence simply do:
53
+ #
54
+ # BP.new.read_fastq(input:"test.fq").reverse_seq.dump.run
55
+ #
56
+ # {:SEQ_NAME=>"M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185",
57
+ # :SEQ=>"GTGACCGGCAGCAAAATGTT",
58
+ # :SEQ_LEN=>20,
59
+ # :SCORES=>"GF0EA0A?A@DFFFF>>>>>"}
60
class ReverseSeq
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Build a new ReverseSeq command and validate its options.
  #
  # @param options [Hash] Options hash.
  #
  # @return [ReverseSeq] Class instance.
  def initialize(options)
    @options = options

    check_options
  end

  # Build the lambda that runs reverse_seq over a stream.
  #
  # Every record is forwarded to the output; records holding a truthy :SEQ
  # value are reversed first.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |rec|
        @status[:records_in] += 1
        reverse(rec) if rec[:SEQ]
        output << rec
        @status[:records_out] += 1
      end
    end
  end

  private

  # Validate the options given to the constructor.
  def check_options
    options_allowed(@options, nil)
  end

  # Reverse the sequence entry held in a record and merge the result back
  # into that record, updating the sequence and residue counters.
  #
  # @param record [Hash] BioDSL record.
  def reverse(record)
    seq = BioDSL::Seq.new_bp(record)
    seq.reverse!

    %i(sequences_in sequences_out).each { |key| @status[key] += 1 }

    len = seq.length
    %i(residues_in residues_out).each { |key| @status[key] += len }

    record.merge!(seq.to_bp)
  end
end
113
+ end