BioDSL 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,285 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # rubocop: disable LineLength
29
+ module BioDSL
30
+ # == Create a histogram with mean sequence quality scores.
31
+ #
32
+ # +plot_scores+ creates a histogram of the mean values per base of the quality
33
+ # scores from sequence data.
34
+ #
35
+ # Plotting is done using GNUplot which allows for different types of output
36
+ # the default one being crufty ASCII graphics.
37
+ #
38
+ # If plotting scores from sequences of variable length you can use the +count+
39
+ # option to co-plot the relative count at each base position. This allows you
40
+ # to detect areas with a low relative count showing a high mean score.
41
+ #
42
+ # GNUplot must be installed for plot_scores to work. Read more here:
43
+ #
44
+ # http://www.gnuplot.info/
45
+ #
46
+ # == Usage
47
+ #
48
+ # plot_scores([count: <bool>[, output: <file>[, force: <bool>
49
+ # [, terminal: <string>[, title: <string>
50
+ # [, xlabel: <string>[, ylabel: <string>
51
+ # [, test: <bool>]]]]]]]])
52
+ #
53
+ # === Options
54
+ #
55
+ # * count: <bool> - Add line plot of relative counts.
56
+ # * output: <file> - Output file.
57
+ # * force: <bool> - Force overwrite existing output file.
58
+ # * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
59
+ # (default=dumb).
60
+ # * title: <string> - Plot title (default="Mean Quality Scores").
61
+ # * xlabel: <string> - X-axis label (default="Sequence Position").
62
+ # * ylabel: <string> - Y-axis label (default="Mean Score").
63
+ # * test: <bool> - Output Gnuplot script instead of plot.
64
+ #
65
+ # == Examples
66
+ #
67
+ # Here we plot the mean quality scores from a FASTQ file:
68
+ #
69
+ # read_fastq(input: "test.fq").plot_scores.run
70
+ #
71
+ # Mean Quality Scores
72
+ # + + + + + +
73
+ # 40 ++-------------+------------+-------------+-------------+------------+++
74
+ # | ***************** mean score ****** |
75
+ # 35 ++ *********************** ++
76
+ # ****************************** ** |
77
+ # 30 +********************************* * ++
78
+ # ************************************* * |
79
+ # 25 +*************************************** * ++
80
+ # ****************************************** ***** |
81
+ # 20 +**************************************************** ** * * ++
82
+ # ******************************************************************** *
83
+ # 15 +**********************************************************************+
84
+ # **********************************************************************
85
+ # 10 +**********************************************************************+
86
+ # **********************************************************************
87
+ # 5 +**********************************************************************+
88
+ # **********************************************************************
89
+ # 0 +**********************************************************************+
90
+ # + + + + + +
91
+ # 0 50 100 150 200 250
92
+ # Sequence position
93
+ #
94
+ # To render X11 output (i.e. instant view) use the +terminal+ option:
95
+ #
96
+ # read_fastq(input: "test.fq").
97
+ # plot_scores(terminal: :x11).run
98
+ #
99
+ # To generate a PNG image and save to file:
100
+ #
101
+ # read_fastq(input: "test.fq").
102
+ # plot_scores(terminal: :png, output: "plot.png").run
103
+ #
104
+ # rubocop: enable LineLength
105
+ # rubocop: disable ClassLength
106
+ class PlotScores
107
+ require 'gnuplotter'
108
+ require 'narray'
109
+ require 'BioDSL/helpers/aux_helper'
110
+
111
+ include AuxHelper
112
+
113
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
114
+ residues_out)
115
+
116
+ SCORES_MAX = 100_000 # Maximum score string length.
117
+
118
+ # Constructor for PlotScores.
119
+ #
120
+ # @param options [Hash] Options hash.
121
+ # @option options [Boolean] :count
122
+ # @option options [String] :output
123
+ # @option options [Boolean] :force
124
+ # @option options [Symbol] :terminal
125
+ # @option options [String] :title
126
+ # @option options [String] :xlabel
127
+ # @option options [String] :ylabel
128
+ # @option options [Boolean] :ylogscale
129
+ # @option options [Boolean] :test
130
+ #
131
+ # @return [PlotScores] Class instance.
132
+ def initialize(options)
133
+ @options = options
134
+ @scores_vec = NArray.int(SCORES_MAX)
135
+ @count_vec = NArray.int(SCORES_MAX)
136
+ @max = 0
137
+
138
+ aux_exist('gnuplot')
139
+ check_options
140
+ default
141
+ end
142
+
143
+ # Return command lambda for plot_scores.
144
+ #
145
+ # @return [Proc] Command lambda.
146
+ def lmb
147
+ lambda do |input, output, status|
148
+ status_init(status, STATS)
149
+
150
+ input.each do |record|
151
+ @status[:records_in] += 1
152
+
153
+ collect_plot_data(record)
154
+
155
+ write_output(output, record)
156
+ end
157
+
158
+ prepare_plot_data
159
+
160
+ plot_defaults
161
+ plot_scores
162
+ plot_count
163
+ plot_output
164
+ end
165
+ end
166
+
167
+ private
168
+
169
+ # Check options.
170
+ def check_options
171
+ options_allowed(@options, :count, :output, :force, :terminal, :title,
172
+ :xlabel, :ylabel, :ylogscale, :test)
173
+ options_allowed_values(@options, count: [true, false])
174
+ options_allowed_values(@options, test: [true, false])
175
+ options_allowed_values(@options, terminal: [:dumb, :post, :svg, :x11,
176
+ :aqua, :png, :pdf])
177
+ options_files_exist_force(@options, :output)
178
+ end
179
+
180
+ # Set default options.
181
+ def default
182
+ @options[:terminal] ||= :dumb
183
+ @options[:title] ||= 'Mean Quality Scores'
184
+ @options[:xlabel] ||= 'Sequence Position'
185
+ @options[:ylabel] ||= 'Mean Score'
186
+ end
187
+
188
+ # Collect plot data from a given record.
189
+ #
190
+ # @param record [Hash] BioDSL record.
191
+ def collect_plot_data(record)
192
+ scores = record[:SCORES]
193
+ return unless scores && scores.length > 0
194
+
195
+ check_length(scores)
196
+
197
+ score_vec = NArray.to_na(scores, 'byte') - Seq::SCORE_BASE
198
+ @scores_vec[0...scores.length] += score_vec
199
+ @count_vec[0...scores.length] += 1
200
+
201
+ @max = scores.length if scores.length > @max
202
+ end
203
+
204
+ # Check if the scores string is longer than SCORES_MAX.
205
+ #
206
+ # @raise [BioDSLError] if too long.
207
+ def check_length(scores)
208
+ return unless scores.length > SCORES_MAX
209
+ msg = "score string too long: #{scores.length} > #{SCORES_MAX}"
210
+ fail BioDSLError, msg
211
+ end
212
+
213
+ # Prepare data to plot.
214
+ def prepare_plot_data
215
+ @max = 1 if @max == 0 # ugly fix to avoid index error
216
+
217
+ count_vec = @count_vec[0...@max].to_f
218
+ count_vec *= (Seq::SCORE_MAX / @count_vec.max(0).to_f)
219
+
220
+ @x = (1..@max).to_a
221
+ @y1 = mean_vec.to_a
222
+ @y2 = count_vec.to_a
223
+ end
224
+
225
+ # Calculate the mean scores vector.
226
+ #
227
+ # @return [NArray] NArray with mean scores.
228
+ def mean_vec
229
+ @scores_vec[0...@max].to_f / @count_vec[0...@max]
230
+ end
231
+
232
+ # Set plot defaults
233
+ def plot_defaults
234
+ @gp = GnuPlotter.new
235
+ @gp.set terminal: @options[:terminal]
236
+ @gp.set title: @options[:title]
237
+ @gp.set xlabel: @options[:xlabel]
238
+ @gp.set ylabel: @options[:ylabel]
239
+ @gp.set output: @options[:output] if @options[:output]
240
+ @gp.set xrange: "[#{@x.min - 1}:#{@x.max + 1}]"
241
+ @gp.set yrange: "[#{Seq::SCORE_MIN}:#{Seq::SCORE_MAX}]"
242
+ @gp.set style: 'fill solid 0.5 border'
243
+ @gp.set xtics: 'out'
244
+ @gp.set ytics: 'out'
245
+ end
246
+
247
+ # Plot scores data.
248
+ def plot_scores
249
+ style = {with: 'boxes lc rgb "red"', title: '"mean score"'}
250
+
251
+ @gp.add_dataset(style) do |plotter|
252
+ @x.zip(@y1).each { |e| plotter << e }
253
+ end
254
+ end
255
+
256
+ # Plot count data.
257
+ def plot_count
258
+ return unless @options[:count]
259
+
260
+ style = {with: 'lines lt rgb "black"', title: '"relative count"'}
261
+
262
+ @gp.add_dataset(style) do |plotter|
263
+ @x.zip(@y2).each { |e| plotter << e }
264
+ end
265
+ end
266
+
267
+ # Output plot
268
+ def plot_output
269
+ if @options[:test]
270
+ $stderr.puts @gp.to_gp
271
+ elsif @options[:terminal] == :dumb
272
+ puts @gp.plot
273
+ else
274
+ @gp.plot
275
+ end
276
+ end
277
+
278
+ # Write record to output.
279
+ def write_output(output, record)
280
+ return unless output
281
+ output << record
282
+ @status[:records_out] += 1
283
+ end
284
+ end
285
+ end
@@ -0,0 +1,153 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Pick a number of random records from the stream.
30
+ #
31
+ # +random+ can be used to pick a given number of random records from the stream.
32
+ # Note that the order of records is preserved.
33
+ #
34
+ # Using the `pairs: true` option allows random picking of interleaved
35
+ # paired-end sequence records.
36
+ #
37
+ # == Usage
38
+ #
39
+ # random(number: <uint>[, pairs: <bool>])
40
+ #
41
+ # === Options
42
+ #
43
+ # * number: <uint> - Number of records to pick.
44
+ # * pairs: <bool> - Preserve interleaved pair order.
45
+ #
46
+ # == Examples
47
+ #
48
+ # To pick some random records from the stream do:
49
+ #
50
+ # BP.new.
51
+ # read_fasta(input: "in.fna").
52
+ # random(number: 10_000).
53
+ # write_fasta(output: "out.fna").
54
+ # run
55
+ class Random
56
+ STATS = %i(records_in records_out)
57
+
58
+ # Constructor for Random.
59
+ #
60
+ # @param options [Hash] Options hash.
61
+ #
62
+ # @option options [Fixnum] :number
63
+ # @option options [Boolean] :pairs
64
+ #
65
+ # @return [Random] Class instance.
66
+ def initialize(options)
67
+ @options = options
68
+ @wanted = nil
69
+
70
+ check_options
71
+ end
72
+
73
+ # Return command lambda for random.
74
+ #
75
+ # @return [Proc] Command lambda.
76
+ def lmb
77
+ lambda do |input, output, status|
78
+ status_init(status, STATS)
79
+
80
+ TmpDir.create('random') do |file, _|
81
+ process_input(input, file)
82
+ decide_wanted
83
+ process_output(output, file)
84
+ end
85
+ end
86
+ end
87
+
88
+ private
89
+
90
+ # Check options.
91
+ def check_options
92
+ options_allowed(@options, :number, :pairs)
93
+ options_required(@options, :number)
94
+ options_allowed_values(@options, pairs: [nil, true, false])
95
+ options_assert(@options, ':number > 0')
96
+ end
97
+
98
+ # Serialize records from input
99
+ #
100
+ # @param input [Enumerator] Input stream.
101
+ # @param file [String] Path to temporary file.
102
+ def process_input(input, file)
103
+ File.open(file, 'wb') do |ios|
104
+ BioDSL::Serializer.new(ios) do |s|
105
+ input.each do |record|
106
+ @status[:records_in] += 1
107
+
108
+ s << record
109
+ end
110
+ end
111
+ end
112
+ end
113
+
114
+ # Compile a random set of numbers.
115
+ def decide_wanted
116
+ if @options[:pairs]
117
+ decide_wanted_pairs
118
+ else
119
+ @wanted =
120
+ (0...@status[:records_in]).to_a.shuffle[0...@options[:number]].to_set
121
+ end
122
+ end
123
+
124
+ # Compile a random set of number pairs.
125
+ def decide_wanted_pairs
126
+ @wanted = Set.new
127
+ range = (0...@status[:records_in])
128
+ num = @options[:number] / 2
129
+
130
+ range.to_a.shuffle.select(&:even?)[0...num].each do |i|
131
+ @wanted.merge([i, i + 1])
132
+ end
133
+ end
134
+
135
+ # Read records from temporary file and emit wanted records to the output
136
+ # stream.
137
+ #
138
+ # @param output [Enumerator::Yielder] Output stream.
139
+ # @param file [String] Path to temporary file with records.
140
+ def process_output(output, file)
141
+ File.open(file, 'rb') do |ios|
142
+ BioDSL::Serializer.new(ios) do |s|
143
+ s.each_with_index do |record, i|
144
+ if @wanted.include? i
145
+ output << record
146
+ @status[:records_out] += 1
147
+ end
148
+ end
149
+ end
150
+ end
151
+ end
152
+ end
153
+ end
@@ -0,0 +1,222 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Read FASTA entries from one or more files.
30
+ #
31
+ # +read_fasta+ reads in sequence entries from FASTA files. Each sequence
32
+ # entry consists of a sequence name prefixed by a '>' followed by the sequence
33
+ # name on a line of its own, followed by one or more lines of sequence until the
34
+ # next entry or the end of the file. The resulting Biopiece record consists of
35
+ # the following record type:
36
+ #
37
+ # {:SEQ_NAME=>"test",
38
+ # :SEQ=>"AGCATCGACTAGCAGCATTT",
39
+ # :SEQ_LEN=>20}
40
+ #
41
+ # Input files may be compressed with gzip or bzip2.
42
+ #
43
+ # For more about the FASTA format:
44
+ #
45
+ # http://en.wikipedia.org/wiki/Fasta_format
46
+ #
47
+ # == Usage
48
+ # read_fasta(input: <glob>[, first: <uint>|last: <uint>])
49
+ #
50
+ # === Options
51
+ # * input <glob> - Input file or file glob expression.
52
+ # * first <uint> - Only read in the _first_ number of entries.
53
+ # * last <uint> - Only read in the _last_ number of entries.
54
+ #
55
+ # == Examples
56
+ #
57
+ # To read all FASTA entries from a file:
58
+ #
59
+ # read_fasta(input: "test.fna")
60
+ #
61
+ # To read all FASTA entries from a gzipped file:
62
+ #
63
+ # read_fasta(input: "test.fna.gz")
64
+ #
65
+ # To read in only 10 records from a FASTA file:
66
+ #
67
+ # read_fasta(input: "test.fna", first: 10)
68
+ #
69
+ # To read in the last 10 records from a FASTA file:
70
+ #
71
+ # read_fasta(input: "test.fna", last: 10)
72
+ #
73
+ # To read all FASTA entries from multiple files:
74
+ #
75
+ # read_fasta(input: "test1.fna,test2.fna")
76
+ #
77
+ # To read FASTA entries from multiple files using a glob expression:
78
+ #
79
+ # read_fasta(input: "*.fna")
80
+ class ReadFasta
81
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
82
+ residues_out)
83
+
84
+ # Constructor for the ReadFasta class.
85
+ #
86
+ # @param [Hash] options Options hash.
87
+ # @option options [String, Array] :input String or Array with glob
88
+ # expressions.
89
+ # @option options [Integer] :first Only read in the first number of entries.
90
+ # @option options [Integer] :last Only read in the last number of entries.
91
+ #
92
+ # @return [ReadFasta] Returns an instance of the class.
93
+ def initialize(options)
94
+ @options = options
95
+ @count = 0
96
+ @buffer = []
97
+
98
+ check_options
99
+ end
100
+
101
+ # Return a lambda for the read_fasta command.
102
+ #
103
+ # @return [Proc] Returns the read_fasta command lambda.
104
+ def lmb
105
+ lambda do |input, output, status|
106
+ status_init(status, STATS)
107
+
108
+ read_input(input, output)
109
+
110
+ options_glob(@options[:input]).each do |file|
111
+ BioDSL::Fasta.open(file) do |ios|
112
+ if @options[:first] && read_first(ios, output)
113
+ elsif @options[:last] && read_last(ios)
114
+ else
115
+ read_all(ios, output)
116
+ end
117
+ end
118
+ end
119
+
120
+ write_buffer(output) if @options[:last]
121
+ end
122
+ end
123
+
124
+ private
125
+
126
+ # Check the options.
127
+ def check_options
128
+ options_allowed(@options, :input, :first, :last)
129
+ options_required(@options, :input)
130
+ options_files_exist(@options, :input)
131
+ options_unique(@options, :first, :last)
132
+ options_assert(@options, ':first >= 0')
133
+ options_assert(@options, ':last >= 0')
134
+ end
135
+
136
+ # Read and emit records from the input to the output stream.
137
+ #
138
+ # @param input [Enumerable::Yielder] Input stream.
139
+ # @param output [Enumerable::Yielder] Output stream.
140
+ def read_input(input, output)
141
+ return unless input
142
+
143
+ input.each do |record|
144
+ output << record
145
+ @status[:records_in] += 1
146
+
147
+ if record[:SEQ]
148
+ @status[:sequences_in] += 1
149
+ @status[:residues_in] += record[:SEQ].length
150
+ end
151
+ end
152
+ end
153
+
154
+ # Read in a specified number of entries from the input and emit to the
155
+ # output.
156
+ #
157
+ # @param input [BioDSL::Fasta] FASTA file input stream.
158
+ # @param output [Enumerable::Yielder] Output stream.
159
+ #
160
+ # @return [Fixnum] Number of read entries.
161
+ def read_first(input, output)
162
+ first = @options[:first]
163
+
164
+ input.each do |entry|
165
+ break if @count == first
166
+ output << entry.to_bp
167
+
168
+ @status[:records_out] += 1
169
+ @status[:sequences_out] += 1
170
+ @status[:residues_out] += entry.length
171
+
172
+ @count += 1
173
+ end
174
+
175
+ @count
176
+ end
177
+
178
+ # Read in entries from input and cache the specified last number in a
179
+ # buffer.
180
+ #
181
+ # @param input [BioDSL::Fasta] FASTA file input stream.
182
+ #
183
+ # @return [Fixnum] Number of read entries.
184
+ def read_last(input)
185
+ last = @options[:last]
186
+
187
+ input.each do |entry|
188
+ @buffer << entry
189
+ @buffer.shift if @buffer.size > last
190
+ end
191
+
192
+ @buffer.size
193
+ end
194
+
195
+ # Read in all entries from input and emit to output.
196
+ #
197
+ # @param input [BioDSL::Fasta] FASTA file input stream.
198
+ # @param output [Enumerable::Yielder] Output stream.
199
+ def read_all(input, output)
200
+ input.each do |entry|
201
+ output << entry.to_bp
202
+
203
+ @status[:records_out] += 1
204
+ @status[:sequences_out] += 1
205
+ @status[:residues_out] += entry.length
206
+ end
207
+ end
208
+
209
+ # Emit all entries in buffer to output.
210
+ #
211
+ # @param output [Enumerable::Yielder] Output stream.
212
+ def write_buffer(output)
213
+ @buffer.each do |entry|
214
+ output << entry.to_bp
215
+
216
+ @status[:records_out] += 1
217
+ @status[:sequences_out] += 1
218
+ @status[:residues_out] += entry.length
219
+ end
220
+ end
221
+ end
222
+ end