BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,285 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # rubocop: disable LineLength
29
+ module BioDSL
30
+ # == Create a histogram with mean sequence quality scores.
31
+ #
32
+ # +plot_scores+ creates a histogram of the mean values per base of the quality
33
+ # scores from sequence data.
34
+ #
35
+ # Plotting is done using GNUplot which allows for different types of output
36
+ # the default one being crufty ASCII graphics.
37
+ #
38
+ # If plotting scores from sequences of variable length you can use the +count+
39
+ # option to co-plot the relative count at each base position. This allow you
40
+ # to detect areas with a low relative count showing a high mean score.
41
+ #
42
+ # GNUplot must be installed for plot_scores to work. Read more here:
43
+ #
44
+ # http://www.gnuplot.info/
45
+ #
46
+ # == Usage
47
+ #
48
+ # plot_scores([count: <bool>[, output: <file>[, force: <bool>
49
+ # [, terminal: <string>[, title: <string>
50
+ # [, xlabel: <string>[, ylabel: <string>
51
+ # [, test: <bool>]]]]]]]])
52
+ #
53
+ # === Options
54
+ #
55
+ # * count: <bool> - Add line plot of relative counts.
56
+ # * output: <file> - Output file.
57
+ # * force: <bool> - Force overwrite existing output file.
58
+ # * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
59
+ # (default=dumb).
60
+ # * title: <string> - Plot title (default="Mean Quality Scores").
61
+ # * xlabel: <string> - X-axis label (default="Sequence Position").
62
+ # * ylabel: <string> - Y-axis label (default="Mean Score").
63
+ # * test: <bool> - Output Gnuplot script instead of plot.
64
+ #
65
+ # == Examples
66
+ #
67
+ # Here we plot the mean quality scores from a FASTQ file:
68
+ #
69
+ # read_fastq(input: "test.fq").plot_scores.run
70
+ #
71
+ # Mean Quality Scores
72
+ # + + + + + +
73
+ # 40 ++-------------+------------+-------------+-------------+------------+++
74
+ # | ***************** mean score ****** |
75
+ # 35 ++ *********************** ++
76
+ # ****************************** ** |
77
+ # 30 +********************************* * ++
78
+ # ************************************* * |
79
+ # 25 +*************************************** * ++
80
+ # ****************************************** ***** |
81
+ # 20 +**************************************************** ** * * ++
82
+ # ******************************************************************** *
83
+ # 15 +**********************************************************************+
84
+ # **********************************************************************
85
+ # 10 +**********************************************************************+
86
+ # **********************************************************************
87
+ # 5 +**********************************************************************+
88
+ # **********************************************************************
89
+ # 0 +**********************************************************************+
90
+ # + + + + + +
91
+ # 0 50 100 150 200 250
92
+ # Sequence position
93
+ #
94
+ # To render X11 output (i.e. instant view) use the +terminal+ option:
95
+ #
96
+ # read_fastq(input: "test.fq").
97
+ # plot_scores(terminal: :x11).run
98
+ #
99
+ # To generate a PNG image and save to file:
100
+ #
101
+ # read_fastq(input: "test.fq").
102
+ # plot_scores(terminal: :png, output: "plot.png").run
103
+ #
104
+ # rubocop: enable LineLength
105
+ # rubocop: disable ClassLength
106
+ class PlotScores
107
+ require 'gnuplotter'
108
+ require 'narray'
109
+ require 'BioDSL/helpers/aux_helper'
110
+
111
+ include AuxHelper
112
+
113
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
114
+ residues_out)
115
+
116
+ SCORES_MAX = 100_000 # Maximum score string length.
117
+
118
+ # Constructor for PlotScores.
119
+ #
120
+ # @param options [Hash] Options hash.
121
+ # @option options [Boolean] :count
122
+ # @option options [String] :output
123
+ # @option options [Boolean] :force
124
+ # @option options [Symbol] :terminal
125
+ # @option options [String] :title
126
+ # @option options [String] :xlabel
127
+ # @option options [String] :ylabel
128
+ # @option options [Boolean] :ylogscale
129
+ # @option options [Boolean] :test
130
+ #
131
+ # @return [PlotScores] Class instance.
132
+ def initialize(options)
133
+ @options = options
134
+ @scores_vec = NArray.int(SCORES_MAX)
135
+ @count_vec = NArray.int(SCORES_MAX)
136
+ @max = 0
137
+
138
+ aux_exist('gnuplot')
139
+ check_options
140
+ default
141
+ end
142
+
143
+ # Return command lambda for plot_scores.
144
+ #
145
+ # @return [Proc] Command lambda.
146
+ def lmb
147
+ lambda do |input, output, status|
148
+ status_init(status, STATS)
149
+
150
+ input.each do |record|
151
+ @status[:records_in] += 1
152
+
153
+ collect_plot_data(record)
154
+
155
+ write_output(output, record)
156
+ end
157
+
158
+ prepare_plot_data
159
+
160
+ plot_defaults
161
+ plot_scores
162
+ plot_count
163
+ plot_output
164
+ end
165
+ end
166
+
167
+ private
168
+
169
+ # Check options.
170
+ def check_options
171
+ options_allowed(@options, :count, :output, :force, :terminal, :title,
172
+ :xlabel, :ylabel, :ylogscale, :test)
173
+ options_allowed_values(@options, count: [true, false])
174
+ options_allowed_values(@options, test: [true, false])
175
+ options_allowed_values(@options, terminal: [:dumb, :post, :svg, :x11,
176
+ :aqua, :png, :pdf])
177
+ options_files_exist_force(@options, :output)
178
+ end
179
+
180
+ # Set default options.
181
+ def default
182
+ @options[:terminal] ||= :dumb
183
+ @options[:title] ||= 'Mean Quality Scores'
184
+ @options[:xlabel] ||= 'Sequence Position'
185
+ @options[:ylabel] ||= 'Mean Score'
186
+ end
187
+
188
+ # Collect plot data from a given record.
189
+ #
190
+ # @param record [Hash] BioDSL record.
191
+ def collect_plot_data(record)
192
+ scores = record[:SCORES]
193
+ return unless scores && scores.length > 0
194
+
195
+ check_length(scores)
196
+
197
+ score_vec = NArray.to_na(scores, 'byte') - Seq::SCORE_BASE
198
+ @scores_vec[0...scores.length] += score_vec
199
+ @count_vec[0...scores.length] += 1
200
+
201
+ @max = scores.length if scores.length > @max
202
+ end
203
+
204
+ # Check if the scores string is longer than SCORES_MAX.
205
+ #
206
+ # @raise [BioDSLError] if too long.
207
+ def check_length(scores)
208
+ return unless scores.length > SCORES_MAX
209
+ msg = "score string too long: #{scores.length} > #{SCORES_MAX}"
210
+ fail BioDSLError, msg
211
+ end
212
+
213
+ # Prepare data to plot.
214
+ def prepare_plot_data
215
+ @max = 1 if @max == 0 # ugly fix to avoid index error
216
+
217
+ count_vec = @count_vec[0...@max].to_f
218
+ count_vec *= (Seq::SCORE_MAX / @count_vec.max(0).to_f)
219
+
220
+ @x = (1..@max).to_a
221
+ @y1 = mean_vec.to_a
222
+ @y2 = count_vec.to_a
223
+ end
224
+
225
+ # Calculate the mean scores vector.
226
+ #
227
+ # @return [NArray] NArray with mean scores.
228
+ def mean_vec
229
+ @scores_vec[0...@max].to_f / @count_vec[0...@max]
230
+ end
231
+
232
+ # Set plot defaults
233
+ def plot_defaults
234
+ @gp = GnuPlotter.new
235
+ @gp.set terminal: @options[:terminal]
236
+ @gp.set title: @options[:title]
237
+ @gp.set xlabel: @options[:xlabel]
238
+ @gp.set ylabel: @options[:ylabel]
239
+ @gp.set output: @options[:output] if @options[:output]
240
+ @gp.set xrange: "[#{@x.min - 1}:#{@x.max + 1}]"
241
+ @gp.set yrange: "[#{Seq::SCORE_MIN}:#{Seq::SCORE_MAX}]"
242
+ @gp.set style: 'fill solid 0.5 border'
243
+ @gp.set xtics: 'out'
244
+ @gp.set ytics: 'out'
245
+ end
246
+
247
+ # Plot scores data.
248
+ def plot_scores
249
+ style = {with: 'boxes lc rgb "red"', title: '"mean score"'}
250
+
251
+ @gp.add_dataset(style) do |plotter|
252
+ @x.zip(@y1).each { |e| plotter << e }
253
+ end
254
+ end
255
+
256
+ # Plot count data.
257
+ def plot_count
258
+ return unless @options[:count]
259
+
260
+ style = {with: 'lines lt rgb "black"', title: '"relative count"'}
261
+
262
+ @gp.add_dataset(style) do |plotter|
263
+ @x.zip(@y2).each { |e| plotter << e }
264
+ end
265
+ end
266
+
267
+ # Output plot
268
+ def plot_output
269
+ if @options[:test]
270
+ $stderr.puts @gp.to_gp
271
+ elsif @options[:terminal] == :dumb
272
+ puts @gp.plot
273
+ else
274
+ @gp.plot
275
+ end
276
+ end
277
+
278
+ # Write record to output.
279
+ def write_output(output, record)
280
+ return unless output
281
+ output << record
282
+ @status[:records_out] += 1
283
+ end
284
+ end
285
+ end
@@ -0,0 +1,153 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Pick a number of random records from the stream.
30
+ #
31
+ # +random+ can be used to pick a given number of random records from the stream.
32
+ # Note that the order of records is preserved.
33
+ #
34
+ # Using the `pairs: true` option allows random picking of interleaved
35
+ # paired-end sequence records.
36
+ #
37
+ # == Usage
38
+ #
39
+ # random(number: <uint>[, pairs: <bool>])
40
+ #
41
+ # === Options
42
+ #
43
+ # * number: <uint> - Number of records to pick.
44
+ # * pairs: <bool> - Preserve interleaved pair order.
45
+ #
46
+ # == Examples
47
+ #
48
+ # To pick some random records from the stream do:
49
+ #
50
+ # BP.new.
51
+ # read_fasta(input: "in.fna").
52
+ # random(number: 10_000).
53
+ # write_fasta(output: "out.fna").
54
+ # run
55
+ class Random
56
+ STATS = %i(records_in records_out)
57
+
58
+ # Constructor for Random.
59
+ #
60
+ # @param options [Hash] Options hash.
61
+ #
62
+ # @option options [Fixnum] :number
63
+ # @option options [Boolean] :pairs
64
+ #
65
+ # @return [Random] Class instance.
66
+ def initialize(options)
67
+ @options = options
68
+ @wanted = nil
69
+
70
+ check_options
71
+ end
72
+
73
+ # Return command lambda for random.
74
+ #
75
+ # @return [Proc] Command lambda.
76
+ def lmb
77
+ lambda do |input, output, status|
78
+ status_init(status, STATS)
79
+
80
+ TmpDir.create('random') do |file, _|
81
+ process_input(input, file)
82
+ decide_wanted
83
+ process_output(output, file)
84
+ end
85
+ end
86
+ end
87
+
88
+ private
89
+
90
+ # Check options.
91
+ def check_options
92
+ options_allowed(@options, :number, :pairs)
93
+ options_required(@options, :number)
94
+ options_allowed_values(@options, pairs: [nil, true, false])
95
+ options_assert(@options, ':number > 0')
96
+ end
97
+
98
+ # Serialize records from input
99
+ #
100
+ # @param input [Enumerator] Input stream.
101
+ # @param file [String] Path to temporary file.
102
+ def process_input(input, file)
103
+ File.open(file, 'wb') do |ios|
104
+ BioDSL::Serializer.new(ios) do |s|
105
+ input.each do |record|
106
+ @status[:records_in] += 1
107
+
108
+ s << record
109
+ end
110
+ end
111
+ end
112
+ end
113
+
114
+ # Compile a random set of numbers.
115
+ def decide_wanted
116
+ if @options[:pairs]
117
+ decide_wanted_pairs
118
+ else
119
+ @wanted =
120
+ (0...@status[:records_in]).to_a.shuffle[0...@options[:number]].to_set
121
+ end
122
+ end
123
+
124
+ # Compile a random set of number pairs.
125
+ def decide_wanted_pairs
126
+ @wanted = Set.new
127
+ range = (0...@status[:records_in])
128
+ num = @options[:number] / 2
129
+
130
+ range.to_a.shuffle.select(&:even?)[0...num].each do |i|
131
+ @wanted.merge([i, i + 1])
132
+ end
133
+ end
134
+
135
+ # Read records from temporary file and emit wanted records to the output
136
+ # stream.
137
+ #
138
+ # @param output [Enumerator::Yielder] Output stream.
139
+ # @param file [String] Path to temporary file with records.
140
+ def process_output(output, file)
141
+ File.open(file, 'rb') do |ios|
142
+ BioDSL::Serializer.new(ios) do |s|
143
+ s.each_with_index do |record, i|
144
+ if @wanted.include? i
145
+ output << record
146
+ @status[:records_out] += 1
147
+ end
148
+ end
149
+ end
150
+ end
151
+ end
152
+ end
153
+ end
@@ -0,0 +1,222 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Read FASTA entries from one or more files.
30
+ #
31
+ # +read_fasta+ reads in sequence entries from FASTA files. Each sequence
32
+ # entry consists of a header line with a '>' followed by the sequence
33
+ # name, followed by one or more lines of sequence until the
34
+ # next entry or the end of the file. The resulting Biopiece record consists of
35
+ # the following record type:
36
+ #
37
+ # {:SEQ_NAME=>"test",
38
+ # :SEQ=>"AGCATCGACTAGCAGCATTT",
39
+ # :SEQ_LEN=>20}
40
+ #
41
+ # Input files may be compressed with gzip or bzip2.
42
+ #
43
+ # For more about the FASTA format:
44
+ #
45
+ # http://en.wikipedia.org/wiki/Fasta_format
46
+ #
47
+ # == Usage
48
+ # read_fasta(input: <glob>[, first: <uint>|last: <uint>])
49
+ #
50
+ # === Options
51
+ # * input <glob> - Input file or file glob expression.
52
+ # * first <uint> - Only read in the _first_ number of entries.
53
+ # * last <uint> - Only read in the _last_ number of entries.
54
+ #
55
+ # == Examples
56
+ #
57
+ # To read all FASTA entries from a file:
58
+ #
59
+ # read_fasta(input: "test.fna")
60
+ #
61
+ # To read all FASTA entries from a gzipped file:
62
+ #
63
+ # read_fasta(input: "test.fna.gz")
64
+ #
65
+ # To read in only 10 records from a FASTA file:
66
+ #
67
+ # read_fasta(input: "test.fna", first: 10)
68
+ #
69
+ # To read in the last 10 records from a FASTA file:
70
+ #
71
+ # read_fasta(input: "test.fna", last: 10)
72
+ #
73
+ # To read all FASTA entries from multiple files:
74
+ #
75
+ # read_fasta(input: "test1.fna,test2.fna")
76
+ #
77
+ # To read FASTA entries from multiple files using a glob expression:
78
+ #
79
+ # read_fasta(input: "*.fna")
80
+ class ReadFasta
81
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
82
+ residues_out)
83
+
84
+ # Constructor for the ReadFasta class.
85
+ #
86
+ # @param [Hash] options Options hash.
87
+ # @option options [String, Array] :input String or Array with glob
88
+ # expressions.
89
+ # @option options [Integer] :first Dump first number of records.
90
+ # @option options [Integer] :last Dump last number of records.
91
+ #
92
+ # @return [ReadFasta] Returns an instance of the class.
93
+ def initialize(options)
94
+ @options = options
95
+ @count = 0
96
+ @buffer = []
97
+
98
+ check_options
99
+ end
100
+
101
+ # Return a lambda for the read_fasta command.
102
+ #
103
+ # @return [Proc] Returns the read_fasta command lambda.
104
+ def lmb
105
+ lambda do |input, output, status|
106
+ status_init(status, STATS)
107
+
108
+ read_input(input, output)
109
+
110
+ options_glob(@options[:input]).each do |file|
111
+ BioDSL::Fasta.open(file) do |ios|
112
+ if @options[:first] && read_first(ios, output)
113
+ elsif @options[:last] && read_last(ios)
114
+ else
115
+ read_all(ios, output)
116
+ end
117
+ end
118
+ end
119
+
120
+ write_buffer(output) if @options[:last]
121
+ end
122
+ end
123
+
124
+ private
125
+
126
+ # Check the options.
127
+ def check_options
128
+ options_allowed(@options, :input, :first, :last)
129
+ options_required(@options, :input)
130
+ options_files_exist(@options, :input)
131
+ options_unique(@options, :first, :last)
132
+ options_assert(@options, ':first >= 0')
133
+ options_assert(@options, ':last >= 0')
134
+ end
135
+
136
+ # Read and emit records from the input to the output stream.
137
+ #
138
+ # @param input [Enumerable::Yielder] Input stream.
139
+ # @param output [Enumerable::Yielder] Output stream.
140
+ def read_input(input, output)
141
+ return unless input
142
+
143
+ input.each do |record|
144
+ output << record
145
+ @status[:records_in] += 1
146
+
147
+ if record[:SEQ]
148
+ @status[:sequences_in] += 1
149
+ @status[:residues_in] += record[:SEQ].length
150
+ end
151
+ end
152
+ end
153
+
154
+ # Read in a specified number of entries from the input and emit to the
155
+ # output.
156
+ #
157
+ # @param input [BioDSL::Fasta] FASTA file input stream.
158
+ # @param output [Enumerable::Yielder] Output stream.
159
+ #
160
+ # @return [Fixnum] Number of read entries.
161
+ def read_first(input, output)
162
+ first = @options[:first]
163
+
164
+ input.each do |entry|
165
+ break if @count == first
166
+ output << entry.to_bp
167
+
168
+ @status[:records_out] += 1
169
+ @status[:sequences_out] += 1
170
+ @status[:residues_out] += entry.length
171
+
172
+ @count += 1
173
+ end
174
+
175
+ @count
176
+ end
177
+
178
+ # Read in entries from input and cache the specified last number in a
179
+ # buffer.
180
+ #
181
+ # @param input [BioDSL::Fasta] FASTA file input stream.
182
+ #
183
+ # @return [Fixnum] Number of read entries.
184
+ def read_last(input)
185
+ last = @options[:last]
186
+
187
+ input.each do |entry|
188
+ @buffer << entry
189
+ @buffer.shift if @buffer.size > last
190
+ end
191
+
192
+ @buffer.size
193
+ end
194
+
195
+ # Read in all entries from input and emit to output.
196
+ #
197
+ # @param input [BioDSL::Fasta] FASTA file input stream.
198
+ # @param output [Enumerable::Yielder] Output stream.
199
+ def read_all(input, output)
200
+ input.each do |entry|
201
+ output << entry.to_bp
202
+
203
+ @status[:records_out] += 1
204
+ @status[:sequences_out] += 1
205
+ @status[:residues_out] += entry.length
206
+ end
207
+ end
208
+
209
+ # Emit all entries in buffer to output.
210
+ #
211
+ # @param output [Enumerable::Yielder] Output stream.
212
+ def write_buffer(output)
213
+ @buffer.each do |entry|
214
+ output << entry.to_bp
215
+
216
+ @status[:records_out] += 1
217
+ @status[:sequences_out] += 1
218
+ @status[:residues_out] += entry.length
219
+ end
220
+ end
221
+ end
222
+ end