BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,233 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Plot tabular numerical data in a heatmap.
30
+ #
31
+ # A heatmap can be plotted with +plot_heatmap+ using numerical data (Non-
32
+ # numerical data is ignored). Data should be tabular with records as rows and
33
+ # keys as columns - the data cells plotted will be the values.
34
+ #
35
+ # Default graphics are crufty ASCII and you probably want high resolution
36
+ # postscript or SVG output instead which is easy using the +terminal+ option.
37
+ # Plotting is done using GNUplot which allows for different types of output.
38
+ #
39
+ # GNUplot must be installed for +plot_heatmap+ to work. Read more here:
40
+ #
41
+ # http://www.gnuplot.info/
42
+ #
43
+ # == Usage
44
+ #
45
+ # plot_heatmap([keys: <list> | skip: <list>[, output: <file>
46
+ # [, force: <bool> [, terminal: <string>
47
+ # [, title: <string>[, xlabel: <string>[, ylabel: <string>
48
+ # [, test: <bool>]]]]]]])
49
+ #
50
+ # === Options
51
+ #
52
+ # * keys: <list> - Comma separated list of keys to plot as columns.
53
+ # * skip: <list> - Comma separated list of keys to skip as columns.
54
+ # * output: <file> - Output file.
55
+ # * force: <bool> - Force overwrite existing output file.
56
+ # * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
57
+ # (default=dumb).
58
+ # * title: <string> - Plot title (default="Heatmap").
59
+ # * xlabel: <string> - X-axis label (default="x").
60
+ # * ylabel: <string> - Y-axis label (default="y").
61
+ # * test: <bool> - Output Gnuplot script instead of plot.
62
+ #
63
+ # == Examples
64
+ #
65
+ # Here we plot a heatmap of data from a table:
66
+ #
67
+ # BP.new.read_table(input: "test.tab").plot_heatmap.run
68
+ #
69
+ # rubocop:disable ClassLength
70
+ class PlotHeatmap
71
+ require 'gnuplotter'
72
+ require 'set'
73
+ require 'BioDSL/helpers/aux_helper'
74
+
75
+ include AuxHelper
76
+
77
+ STATS = %i(records_in records_out)
78
+
79
+ # Constructor for PlotHeatmap.
80
+ #
81
+ # @param options [Hash] Options hash.
82
+ # @option options [Array] :keys List of keys to plot as column.
83
+ # @option options [Array] :skip List of keys to skip as column.
84
+ # @option options [String] :output Path to output file.
85
+ # @option options [Boolean] :force Flag to force overwrite output file.
86
+ # @option options [Symbol] :terminal Set plot terminal type.
87
+ # @option options [String] :title Set plot title.
88
+ # @option options [String] :xlabel Set plot xlabel.
89
+ # @option options [String] :ylabel Set plot ylabel
90
+ # @option options [Boolean] :logscale Logscale Z-axis.
91
+ # @option options [Boolean] :test Output gnuplot script.
92
+ #
93
+ # @return [PlotHeatmap] Class instance.
94
+ def initialize(options)
95
+ @options = options
96
+ @headings = nil
97
+ @skip_keys = determine_skip_keys
98
+
99
+ aux_exist('gnuplot')
100
+ check_options
101
+ defaults
102
+ end
103
+
104
+ # Return command lambda for plot_heatmap.
105
+ #
106
+ # @return [Proc] Command lambda.
107
+ def lmb
108
+ lambda do |input, output, status|
109
+ status_init(status, STATS)
110
+
111
+ gp = GnuPlotter.new
112
+
113
+ plot_options(gp)
114
+ plot_dataset(gp, input, output)
115
+ plot_output(gp)
116
+ end
117
+ end
118
+
119
+ private
120
+
121
+ # Check options.
122
+ def check_options
123
+ options_allowed(@options, :keys, :skip, :output, :force, :terminal,
124
+ :title, :xlabel, :ylabel, :logscale, :test)
125
+ options_unique(@options, :keys, :skip)
126
+ options_allowed_values(@options, terminal: [:dumb, :post, :svg, :x11,
127
+ :aqua, :png, :pdf])
128
+ options_allowed_values(@options, test: [nil, true, false])
129
+ options_allowed_values(@options, logscale: [nil, true, false])
130
+ options_files_exist_force(@options, :output)
131
+ end
132
+
133
+ # Set default options.
134
+ def defaults
135
+ @options[:terminal] ||= :dumb
136
+ @options[:title] ||= 'Heatmap'
137
+ @options[:xlabel] ||= 'x'
138
+ @options[:ylabel] ||= 'y'
139
+ end
140
+
141
+ # Compile a set of keys to skip.
142
+ #
143
+ # @return [Set] Set of keys to skip.
144
+ def determine_skip_keys
145
+ return unless @options[:skip]
146
+ @options[:skip].each_with_object(Set.new) { |e, a| a << e.to_sym }
147
+ end
148
+
149
+ # Determine the headings.
150
+ #
151
+ # @param record [Hash] BioDSL record.
152
+ def determine_headings(record)
153
+ @headings =
154
+ if @options[:keys]
155
+ @options[:keys].map(&:to_sym)
156
+ elsif record.keys.first =~ /^V\d+$/
157
+ sort_keys(record)
158
+ else
159
+ record.keys
160
+ end
161
+
162
+ @headings.reject! { |r| @skip_keys.include? r } if @options[:skip]
163
+ end
164
+
165
+ # Sort records keys numerically, when the keys are in the format Vn, where n
166
+ # is an Integer.
167
+ #
168
+ # @param record [Hash] BioDSL record.
169
+ #
170
+ # @return [Array] List of sorted keys.
171
+ def sort_keys(record)
172
+ record.keys.sort do |a, b|
173
+ a.to_s[1..a.to_s.size].to_i <=> b.to_s[1..a.to_s.size].to_i
174
+ end
175
+ end
176
+
177
+ # Set options for plot.
178
+ #
179
+ # @param gp [GnuPlotter] GnuPlotter object.
180
+ def plot_options(gp)
181
+ gp.set terminal: @options[:terminal].to_s
182
+ gp.set title: @options[:title]
183
+ gp.set xlabel: @options[:xlabel]
184
+ gp.set ylabel: @options[:ylabel]
185
+ gp.set output: @options[:output] if @options[:output]
186
+ gp.set view: 'map'
187
+ gp.set autoscale: 'xfix'
188
+ gp.set autoscale: 'yfix'
189
+ gp.set nokey: true
190
+ gp.set tic: 'scale 0'
191
+ gp.set palette: 'rgbformulae 22,13,10'
192
+ gp.set logscale: 'cb' if @options[:logscale]
193
+ gp.unset xtics: true
194
+ gp.unset ytics: true
195
+ end
196
+
197
+ # Plot relevant data from the input stream.
198
+ #
199
+ # @param gp [GnuPlotter] GnuPlotter object.
200
+ # @param input [Enumerator] Input stream.
201
+ # @param output [Enumerator::Yielder] Output stream.
202
+ def plot_dataset(gp, input, output)
203
+ gp.add_dataset(matrix: :true, with: 'image') do |plotter|
204
+ input.each do |record|
205
+ @status[:records_in] += 1
206
+
207
+ determine_headings(record) unless @headings
208
+
209
+ plotter << record.values_at(*@headings)
210
+
211
+ next unless output
212
+
213
+ output << record
214
+
215
+ @status[:records_out] += 1
216
+ end
217
+ end
218
+ end
219
+
220
+ # Output plot data according to options.
221
+ #
222
+ # @param gp [GnuPlotter] GnuPlotter object.
223
+ def plot_output(gp)
224
+ if @options[:test]
225
+ $stderr.puts gp.to_gp
226
+ elsif @options[:terminal] == :dumb
227
+ puts gp.splot
228
+ else
229
+ gp.splot
230
+ end
231
+ end
232
+ end
233
+ end
@@ -0,0 +1,306 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # rubocop:disable LineLength
29
+ module BioDSL
30
+ # == Plot a histogram of numerical values for a specified key.
31
+ #
32
+ # +plot_histogram+ creates a histogram plot of the values for a specified key
33
+ # from all records in the stream. Plotting is done using GNUplot which allows
34
+ # for different types of output the default one being crufty ASCII graphics.
35
+ #
36
+ # GNUplot's facility for setting the xrange labels is used for numeric values,
37
+ # while for non-numeric values these are used for xrange labels.
38
+ #
39
+ # GNUplot must be installed for plot_histogram to work. Read more here:
40
+ #
41
+ # http://www.gnuplot.info/
42
+ #
43
+ # == Usage
44
+ #
45
+ # plot_histogram(<key: <string>>[, value: <string>[, output: <file>
46
+ # [, force: <bool>[, terminal: <string>[, title: <string>
47
+ # [, xlabel: <string>[, ylabel: <string>
48
+ # [, ylogscale: <bool>[, test: <bool>]]]]]]]]])
49
+ #
50
+ # === Options
51
+ #
52
+ # * key: <string> - Key to use for plotting.
53
+ # * value: <string> - Alternative key whose value to use.
54
+ # * output: <file> - Output file.
55
+ # * force: <bool> - Force overwrite existing output file.
56
+ # * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
57
+ # (default=dumb).
58
+ # * title: <string> - Plot title (default="Histogram").
59
+ # * xlabel: <string> - X-axis label (default=<key>).
60
+ # * ylabel: <string> - Y-axis label (default="n").
61
+ # * ylogscale: <bool> - Set y-axis to log scale.
62
+ # * test: <bool> - Output Gnuplot script instead of plot.
63
+ #
64
+ # == Examples
65
+ #
66
+ # Here we plot a histogram of sequence lengths from a FASTA file:
67
+ #
68
+ # read_fasta(input: "test.fna").plot_histogram(key: :SEQ_LEN).run
69
+ #
70
+ # Histogram
71
+ # + + + + + +
72
+ # 90 +++-------------+------------+------------+------------+-------------+++
73
+ # | |
74
+ # 80 ++ **++
75
+ # | **|
76
+ # 70 ++ **++
77
+ # 60 ++ **++
78
+ # | **|
79
+ # 50 ++ **++
80
+ # | **|
81
+ # 40 ++ **++
82
+ # | **|
83
+ # 30 ++ **++
84
+ # 20 ++ **++
85
+ # | **|
86
+ # 10 ++ **++
87
+ # | ******|
88
+ # 0 +++-------------+------------+**--------**+--***-------+**--**********++
89
+ # + + + + + +
90
+ # 0 10 20 30 40 50
91
+ # SEQ_LEN
92
+ #
93
+ # To render X11 output (i.e. instant view) use the +terminal+ option:
94
+ #
95
+ # read_fasta(input: "test.fna").
96
+ # plot_histogram(key: :SEQ_LEN, terminal: :x11).run
97
+ #
98
+ # To generate a PNG image and save to file:
99
+ #
100
+ # read_fasta(input: "test.fna").
101
+ # plot_histogram(key: :SEQ_LEN, terminal: :png, output: "plot.png").run
102
+ #
103
+ # rubocop:disable ClassLength
104
+ # rubocop:enable LineLength
105
+ class PlotHistogram
106
+ require 'gnuplotter'
107
+ require 'BioDSL/helpers/aux_helper'
108
+
109
+ include AuxHelper
110
+
111
+ STATS = %i(records_in records_out)
112
+
113
+ # Constructor for PlotHistogram.
114
+ #
115
+ # @param options [Hash] Options hash.
116
+ # @option options [String,:Symbol] :key
117
+ # @option options [String,:Symbol] :value
118
+ # @option options [String] :output
119
+ # @option options [Boolean] :force
120
+ # @option options [String,:Symbol] :terminal
121
+ # @option options [String] :title
122
+ # @option options [String] :xlabel
123
+ # @option options [String] :ylabel
124
+ # @option options [Boolean] :ylogscale
125
+ # @option options [Boolean] :test
126
+ #
127
+ # @return [PlotHistogram] class instance.
128
+ def initialize(options)
129
+ @options = options
130
+ @key = options[:key]
131
+ @value = options[:value]
132
+ @count_hash = Hash.new(0)
133
+ @gp = nil
134
+
135
+ aux_exist('gnuplot')
136
+ check_options
137
+ defaults
138
+ end
139
+
140
+ # Return the command lambda for plot_histogram
141
+ #
142
+ # @return [Proc] command lambda.
143
+ def lmb
144
+ lambda do |input, output, status|
145
+ status_init(status, STATS)
146
+
147
+ process_input(input, output)
148
+ plot_create
149
+ plot_output
150
+ end
151
+ end
152
+
153
+ private
154
+
155
+ # Check options.
156
+ def check_options
157
+ options_allowed(@options, :key, :value, :output, :force, :terminal,
158
+ :title, :xlabel, :ylabel, :ylogscale, :test)
159
+ options_allowed_values(@options, terminal: [:dumb, :post, :svg, :x11,
160
+ :aqua, :png, :pdf])
161
+ options_allowed_values(@options, force: [nil, true, false])
162
+ options_allowed_values(@options, test: [nil, true, false])
163
+ options_required(@options, :key)
164
+ options_files_exist_force(@options, :output)
165
+ end
166
+
167
+ # Set default values for options hash.
168
+ def defaults
169
+ @options[:terminal] ||= :dumb
170
+ @options[:title] ||= 'Histogram'
171
+ @options[:xlabel] ||= @options[:key]
172
+ @options[:ylabel] ||= 'n'
173
+
174
+ @options[:ylogscale] &&
175
+ @options[:ylabel] = "log10(#{@options[:ylabel]})"
176
+ end
177
+
178
+ # Process the input stream, collect all plot data, and output records.
179
+ #
180
+ # @param input [Enumerator] Input stream.
181
+ # @param output [Enumerator::Yielder] Output stream.
182
+ def process_input(input, output)
183
+ input.each do |record|
184
+ @status[:records_in] += 1
185
+
186
+ if (k = record[@key])
187
+ if @value
188
+ if (v = record[@value])
189
+ @count_hash[k] += v
190
+ else
191
+ fail "value: #{@value} not found in record: #{record}"
192
+ end
193
+ else
194
+ @count_hash[k] += 1
195
+ end
196
+ end
197
+
198
+ process_output(output, record)
199
+ end
200
+ end
201
+
202
+ # Output record to the output stream if such is defined.
203
+ #
204
+ # @param output [Enumerator::Yielder] Output stream.
205
+ # @param record [Hash] BioDSL record.
206
+ def process_output(output, record)
207
+ return unless output
208
+ output << record
209
+ @status[:records_out] += 1
210
+ end
211
+
212
+ # Create a Gnuplot using the collected data from the input stream.
213
+ def plot_create
214
+ @gp = GnuPlotter.new
215
+ plot_defaults
216
+ plot_fix_ylogscale
217
+
218
+ if @count_hash.empty?
219
+ plot_empty
220
+ elsif @count_hash.keys.first.is_a? Numeric
221
+ plot_numeric
222
+ else
223
+ plot_string
224
+ end
225
+
226
+ plot_fix_xtics
227
+ end
228
+
229
+ # Set the default values for the plot.
230
+ def plot_defaults
231
+ @gp.set terminal: @options[:terminal].to_s
232
+ @gp.set title: @options[:title]
233
+ @gp.set xlabel: @options[:xlabel]
234
+ @gp.set ylabel: @options[:ylabel]
235
+ @gp.set autoscale: 'xfix'
236
+ @gp.set style: 'fill solid 0.5 border'
237
+ @gp.set xtics: 'out'
238
+ @gp.set ytics: 'out'
239
+ end
240
+
241
+ # Set plot values accordingly if the ylogscale flag is set.
242
+ def plot_fix_ylogscale
243
+ if @options[:ylogscale]
244
+ @gp.set logscale: 'y'
245
+ @gp.set yrange: '[1:*]'
246
+ else
247
+ @gp.set yrange: '[0:*]'
248
+ end
249
+ end
250
+
251
+ # Set plot values to create an empty plot if no plot data was collected.
252
+ def plot_empty
253
+ @gp.set yrange: '[-1:1]'
254
+ @gp.set key: 'off'
255
+ @gp.unset xtics: true
256
+ @gp.unset ytics: true
257
+ end
258
+
259
+ # If plot data have numeric xtic values use numeric xtic labels.
260
+ def plot_numeric
261
+ x_max = @count_hash.keys.max || 0
262
+
263
+ @gp.add_dataset(using: '1:2', with: 'boxes notitle') do |plotter|
264
+ (0..x_max).each { |x| plotter << [x, @count_hash[x]] }
265
+ end
266
+ end
267
+
268
+ # If plot data have string xtic values use these as xtic labels.
269
+ def plot_string
270
+ plot_xtics_rotate
271
+
272
+ @gp.add_dataset(using: '2:xticlabels(1)',
273
+ with: 'boxes notitle lc rgb "red"') do |plotter|
274
+ @count_hash.each { |k, v| plotter << [k, v] }
275
+ end
276
+ end
277
+
278
+ # If xtic labels are longer than 2, rotate these.
279
+ def plot_xtics_rotate
280
+ return unless @count_hash.first.first.size > 2
281
+ @gp.set xtics: 'rotate'
282
+ @gp.set xlabel: ''
283
+ end
284
+
285
+ # Determine if xtics should be plotted and unset these if not. Don't plot
286
+ # xtics if more than 50 strings.
287
+ def plot_fix_xtics
288
+ return unless @count_hash.keys.first.class == String &&
289
+ @count_hash.size > 50
290
+ @gp.unset xtics: true
291
+ end
292
+
293
+ # Output plot data
294
+ def plot_output
295
+ @gp.set output: @options[:output] if @options[:output]
296
+
297
+ if @options[:test]
298
+ $stderr.puts @gp.to_gp
299
+ elsif @options[:terminal] == :dumb
300
+ puts @gp.plot
301
+ else
302
+ @gp.plot
303
+ end
304
+ end
305
+ end
306
+ end