BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Plot tabular numerical data in a heatmap.
|
|
30
|
+
#
|
|
31
|
+
# A heatmap can be plotted with +plot_heatmap+ using numerical data (Non-
|
|
32
|
+
# numerical data is ignored). Data should be tabular with records as rows and
|
|
33
|
+
# keys as columns - the data cells plotted will be the values.
|
|
34
|
+
#
|
|
35
|
+
# Default graphics are crufty ASCII and you probably want high resolution
|
|
36
|
+
# postscript or SVG output instead, which is easy using the +terminal+ option.
|
|
37
|
+
# Plotting is done using GNUplot which allows for different types of output.
|
|
38
|
+
#
|
|
39
|
+
# GNUplot must be installed for +plot_heatmap+ to work. Read more here:
|
|
40
|
+
#
|
|
41
|
+
# http://www.gnuplot.info/
|
|
42
|
+
#
|
|
43
|
+
# == Usage
|
|
44
|
+
#
|
|
45
|
+
# plot_heatmap([keys: <list> | skip: <list>[, output: <file>
|
|
46
|
+
# [, force: <bool> [, terminal: <string>
|
|
47
|
+
# [, title: <string>[, xlabel: <string>[, ylabel: <string>
|
|
48
|
+
# [, test: <bool>]]]]]]])
|
|
49
|
+
#
|
|
50
|
+
# === Options
|
|
51
|
+
#
|
|
52
|
+
# * keys: <list> - Comma separated list of keys to plot as columns.
|
|
53
|
+
# * skip: <list> - Comma separated list of keys to skip as columns.
|
|
54
|
+
# * output: <file> - Output file.
|
|
55
|
+
# * force: <bool> - Force overwrite existing output file.
|
|
56
|
+
# * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
|
|
57
|
+
# (default=dumb).
|
|
58
|
+
# * title: <string> - Plot title (default="Heatmap").
|
|
59
|
+
# * xlabel: <string> - X-axis label (default="x").
|
|
60
|
+
# * ylabel: <string> - Y-axis label (default="y").
|
|
61
|
+
# * test: <bool> - Output Gnuplot script instead of plot.
|
|
62
|
+
#
|
|
63
|
+
# == Examples
|
|
64
|
+
#
|
|
65
|
+
# Here we plot a heatmap of data from a table:
|
|
66
|
+
#
|
|
67
|
+
# BP.new.read_table(input: "test.tab").plot_heatmap.run
|
|
68
|
+
#
|
|
69
|
+
# rubocop:disable ClassLength
|
|
70
|
+
class PlotHeatmap
|
|
71
|
+
require 'gnuplotter'
|
|
72
|
+
require 'set'
|
|
73
|
+
require 'BioDSL/helpers/aux_helper'
|
|
74
|
+
|
|
75
|
+
include AuxHelper
|
|
76
|
+
|
|
77
|
+
STATS = %i(records_in records_out)
|
|
78
|
+
|
|
79
|
+
# Constructor for PlotHeatmap.
#
# Stores the options, derives the set of keys to skip from the raw options,
# then calls aux_exist('gnuplot') (presumably verifying that the gnuplot
# executable is available - confirm in AuxHelper), validates the options and
# fills in defaults for options that were not given.
#
# @param options [Hash] Options hash.
# @option options [Array]   :keys     List of keys to plot as columns.
# @option options [Array]   :skip     List of keys to skip as columns.
# @option options [String]  :output   Path to output file.
# @option options [Boolean] :force    Flag to force overwrite output file.
# @option options [Symbol]  :terminal Set plot terminal type.
# @option options [String]  :title    Set plot title.
# @option options [String]  :xlabel   Set plot xlabel.
# @option options [String]  :ylabel   Set plot ylabel.
# @option options [Boolean] :logscale Logscale Z-axis.
# @option options [Boolean] :test     Output gnuplot script.
#
# @return [PlotHeatmap] Class instance.
def initialize(options)
  @options   = options
  @skip_keys = determine_skip_keys
  @headings  = nil # Determined lazily from the first record seen.

  aux_exist('gnuplot')
  check_options
  defaults
end
|
|
103
|
+
|
|
104
|
+
# Return the command lambda for plot_heatmap.
#
# When called, the lambda runs status_init with the given status and STATS,
# creates a new GnuPlotter, applies the plot options, feeds the records of the
# input stream into the plot (passing them on to the output stream) and
# finally emits the plot.
#
# @return [Proc] Command lambda taking input, output and status.
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    plotter = GnuPlotter.new
    plot_options(plotter)
    plot_dataset(plotter, input, output)
    plot_output(plotter)
  end
end
|
|
118
|
+
|
|
119
|
+
private
|
|
120
|
+
|
|
121
|
+
# Check options.
#
# Runs the option helpers in a fixed order: the list of permitted option
# names, the :keys/:skip pair via options_unique, the permitted values for
# :terminal, :test and :logscale, and finally the :output file check via
# options_files_exist_force.
#
# NOTE(review): :force is accepted but its value is not checked here, unlike
# in PlotHistogram#check_options - confirm whether that is intended.
def check_options
  terminals = %i(dumb post svg x11 aqua png pdf)

  options_allowed(@options, :keys, :skip, :output, :force, :terminal,
                  :title, :xlabel, :ylabel, :logscale, :test)
  options_unique(@options, :keys, :skip)
  options_allowed_values(@options, terminal: terminals)

  # Boolean style flags may be unset, true or false.
  %i(test logscale).each do |flag|
    options_allowed_values(@options, flag => [nil, true, false])
  end

  options_files_exist_force(@options, :output)
end
|
|
132
|
+
|
|
133
|
+
# Set default options.
#
# Every option that is missing (or falsy) in the options hash is replaced by
# its default: terminal :dumb, title 'Heatmap', xlabel 'x' and ylabel 'y'.
def defaults
  fallbacks = {
    terminal: :dumb,
    title:    'Heatmap',
    xlabel:   'x',
    ylabel:   'y'
  }

  fallbacks.each { |option, fallback| @options[option] ||= fallback }
end
|
|
140
|
+
|
|
141
|
+
# Compile a set of keys to skip.
#
# Each entry of the :skip option is converted to a Symbol.
#
# @return [Set, nil] Set of keys to skip, or nil when no :skip option is set.
def determine_skip_keys
  skip = @options[:skip]
  return unless skip

  Set.new(skip.map(&:to_sym))
end
|
|
148
|
+
|
|
149
|
+
# Determine the headings, i.e. the record keys whose values are plotted.
#
# The :keys option wins when given (converted to Symbols). Otherwise the keys
# of the record are used; if the first key looks like Vn (V followed by
# digits) they are ordered with sort_keys. Headings found in the skip set are
# removed when the :skip option is set.
#
# @param record [Hash] BioDSL record.
def determine_headings(record)
  requested = @options[:keys]

  @headings =
    if requested
      requested.map(&:to_sym)
    elsif record.keys.first =~ /^V\d+$/
      sort_keys(record)
    else
      record.keys
    end

  return unless @options[:skip]

  @headings.reject! { |heading| @skip_keys.include?(heading) }
end
|
|
164
|
+
|
|
165
|
+
# Sort record keys numerically, when the keys are in the format Vn, where n
# is an Integer (e.g. V1, V2, V10).
#
# The numeric part of a key is everything after its first character. Each key
# is parsed using its own length, so keys of different length - including
# zero padded ones such as V010 - are ordered by their full numeric value.
#
# @param record [Hash] BioDSL record.
#
# @return [Array] List of keys sorted by numeric suffix.
def sort_keys(record)
  record.keys.sort_by { |key| key.to_s[1..-1].to_i }
end
|
|
176
|
+
|
|
177
|
+
# Set options for plot.
#
# The settings are applied in a fixed order: terminal, title and axis labels
# from the options hash, the output file when one was given, the fixed
# heatmap settings, an optional log scale on 'cb' and finally the removal of
# the x and y tics.
#
# @param gp [GnuPlotter] GnuPlotter object.
def plot_options(gp)
  gp.set(terminal: @options[:terminal].to_s)

  %i(title xlabel ylabel).each { |name| gp.set(name => @options[name]) }

  gp.set(output: @options[:output]) if @options[:output]

  # Pairs rather than a Hash since :autoscale is set twice.
  [
    [:view,      'map'],
    [:autoscale, 'xfix'],
    [:autoscale, 'yfix'],
    [:nokey,     true],
    [:tic,       'scale 0'],
    [:palette,   'rgbformulae 22,13,10']
  ].each { |name, value| gp.set(name => value) }

  gp.set(logscale: 'cb') if @options[:logscale]

  %i(xtics ytics).map { |axis| gp.unset(axis => true) }.last
end
|
|
196
|
+
|
|
197
|
+
# Plot relevant data from the input stream.
#
# Every record is counted, its values for the headings are appended to the
# dataset as one row, and the record is passed on to the output stream when
# one is given. Headings are determined from the first record seen unless
# they are already known.
#
# @param gp     [GnuPlotter]          GnuPlotter object.
# @param input  [Enumerator]          Input stream.
# @param output [Enumerator::Yielder] Output stream.
def plot_dataset(gp, input, output)
  gp.add_dataset(matrix: :true, with: 'image') do |dataset|
    input.each do |record|
      @status[:records_in] += 1

      @headings || determine_headings(record)

      dataset << record.values_at(*@headings)

      if output
        output << record
        @status[:records_out] += 1
      end
    end
  end
end
|
|
219
|
+
|
|
220
|
+
# Output plot data according to options.
#
# With the :test option the gnuplot script is written to $stderr instead of
# plotting. With the :dumb terminal the result of splot is printed to stdout.
# For any other terminal splot is simply run.
#
# @param gp [GnuPlotter] GnuPlotter object.
def plot_output(gp)
  return $stderr.puts(gp.to_gp) if @options[:test]
  return puts(gp.splot) if @options[:terminal] == :dumb

  gp.splot
end
|
|
232
|
+
end
|
|
233
|
+
end
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
# rubocop:disable LineLength
|
|
29
|
+
module BioDSL
|
|
30
|
+
# == Plot a histogram of numerical values for a specified key.
|
|
31
|
+
#
|
|
32
|
+
# +plot_histogram+ creates a histogram plot of the values for a specified key
|
|
33
|
+
# from all records in the stream. Plotting is done using GNUplot which allows
|
|
34
|
+
# for different types of output the default one being crufty ASCII graphics.
|
|
35
|
+
#
|
|
36
|
+
# GNUplot's facility for setting the xrange labels is used for numeric values,
|
|
37
|
+
# while for non-numeric values these are used for xrange labels.
|
|
38
|
+
#
|
|
39
|
+
# GNUplot must be installed for plot_histogram to work. Read more here:
|
|
40
|
+
#
|
|
41
|
+
# http://www.gnuplot.info/
|
|
42
|
+
#
|
|
43
|
+
# == Usage
|
|
44
|
+
#
|
|
45
|
+
# plot_histogram(<key: <string>>[, value: <string>[, output: <file>
|
|
46
|
+
# [, force: <bool>[, terminal: <string>[, title: <string>
|
|
47
|
+
# [, xlabel: <string>[, ylabel: <string>
|
|
48
|
+
# [, ylogscale: <bool>[, test: <bool>]]]]]]]]])
|
|
49
|
+
#
|
|
50
|
+
# === Options
|
|
51
|
+
#
|
|
52
|
+
# * key: <string> - Key to use for plotting.
|
|
53
|
+
# * value: <string> - Alternative key whose value to use.
|
|
54
|
+
# * output: <file> - Output file.
|
|
55
|
+
# * force: <bool> - Force overwrite existing output file.
|
|
56
|
+
# * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
|
|
57
|
+
# (default=dumb).
|
|
58
|
+
# * title: <string> - Plot title (default="Histogram").
|
|
59
|
+
# * xlabel: <string> - X-axis label (default=<key>).
|
|
60
|
+
# * ylabel: <string> - Y-axis label (default="n").
|
|
61
|
+
# * ylogscale: <bool> - Set y-axis to log scale.
|
|
62
|
+
# * test: <bool> - Output Gnuplot script instead of plot.
|
|
63
|
+
#
|
|
64
|
+
# == Examples
|
|
65
|
+
#
|
|
66
|
+
# Here we plot a histogram of sequence lengths from a FASTA file:
|
|
67
|
+
#
|
|
68
|
+
# read_fasta(input: "test.fna").plot_histogram(key: :SEQ_LEN).run
|
|
69
|
+
#
|
|
70
|
+
# Histogram
|
|
71
|
+
# + + + + + +
|
|
72
|
+
# 90 +++-------------+------------+------------+------------+-------------+++
|
|
73
|
+
# | |
|
|
74
|
+
# 80 ++ **++
|
|
75
|
+
# | **|
|
|
76
|
+
# 70 ++ **++
|
|
77
|
+
# 60 ++ **++
|
|
78
|
+
# | **|
|
|
79
|
+
# 50 ++ **++
|
|
80
|
+
# | **|
|
|
81
|
+
# 40 ++ **++
|
|
82
|
+
# | **|
|
|
83
|
+
# 30 ++ **++
|
|
84
|
+
# 20 ++ **++
|
|
85
|
+
# | **|
|
|
86
|
+
# 10 ++ **++
|
|
87
|
+
# | ******|
|
|
88
|
+
# 0 +++-------------+------------+**--------**+--***-------+**--**********++
|
|
89
|
+
# + + + + + +
|
|
90
|
+
# 0 10 20 30 40 50
|
|
91
|
+
# SEQ_LEN
|
|
92
|
+
#
|
|
93
|
+
# To render X11 output (i.e. instant view) use the +terminal+ option:
|
|
94
|
+
#
|
|
95
|
+
# read_fasta(input: "test.fna").
|
|
96
|
+
# plot_histogram(key: :SEQ_LEN, terminal: :x11).run
|
|
97
|
+
#
|
|
98
|
+
# To generate a PNG image and save to file:
|
|
99
|
+
#
|
|
100
|
+
# read_fasta(input: "test.fna").
|
|
101
|
+
# plot_histogram(key: :SEQ_LEN, terminal: :png, output: "plot.png").run
|
|
102
|
+
#
|
|
103
|
+
# rubocop:disable ClassLength
|
|
104
|
+
# rubocop:enable LineLength
|
|
105
|
+
class PlotHistogram
|
|
106
|
+
require 'gnuplotter'
|
|
107
|
+
require 'BioDSL/helpers/aux_helper'
|
|
108
|
+
|
|
109
|
+
include AuxHelper
|
|
110
|
+
|
|
111
|
+
STATS = %i(records_in records_out)
|
|
112
|
+
|
|
113
|
+
# Constructor for PlotHistogram.
#
# Stores the options, prepares an empty count table (missing bins read as 0),
# then calls aux_exist('gnuplot') (presumably verifying that the gnuplot
# executable is available - confirm in AuxHelper), validates the options and
# fills in defaults for options that were not given.
#
# @param options [Hash] Options hash.
# @option options [String, Symbol] :key       Key to use for plotting.
# @option options [String, Symbol] :value     Alternative key whose value to
#                                             use.
# @option options [String]         :output    Output file.
# @option options [Boolean]        :force     Force overwrite existing output
#                                             file.
# @option options [String, Symbol] :terminal  Terminal for output.
# @option options [String]         :title     Plot title.
# @option options [String]         :xlabel    X-axis label.
# @option options [String]         :ylabel    Y-axis label.
# @option options [Boolean]        :ylogscale Set y-axis to log scale.
# @option options [Boolean]        :test      Output Gnuplot script instead of
#                                             plot.
#
# @return [PlotHistogram] class instance.
def initialize(options)
  @options       = options
  @key, @value   = options.values_at(:key, :value)
  @count_hash    = Hash.new(0)
  @gp            = nil # Created in plot_create.

  aux_exist('gnuplot')
  check_options
  defaults
end
|
|
139
|
+
|
|
140
|
+
# Return the command lambda for plot_histogram.
#
# When called, the lambda runs status_init with the given status and STATS,
# consumes the input stream (collecting counts and passing records on),
# builds the plot and finally emits it.
#
# @return [Proc] Command lambda taking input, output and status.
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    process_input(input, output)
    plot_create
    plot_output
  end
end
|
|
152
|
+
|
|
153
|
+
private
|
|
154
|
+
|
|
155
|
+
# Check options.
#
# Runs the option helpers in a fixed order: the list of permitted option
# names, the permitted values for :terminal, :force and :test, the required
# :key option, and finally the :output file check via
# options_files_exist_force.
#
# NOTE(review): the value of :ylogscale is not checked here although :force
# and :test are - confirm whether that is intended.
def check_options
  terminals = %i(dumb post svg x11 aqua png pdf)

  options_allowed(@options, :key, :value, :output, :force, :terminal,
                  :title, :xlabel, :ylabel, :ylogscale, :test)
  options_allowed_values(@options, terminal: terminals)

  # Boolean style flags may be unset, true or false.
  %i(force test).each do |flag|
    options_allowed_values(@options, flag => [nil, true, false])
  end

  options_required(@options, :key)
  options_files_exist_force(@options, :output)
end
|
|
166
|
+
|
|
167
|
+
# Set default values for options hash.
#
# Missing (or falsy) options get their default: terminal :dumb, title
# 'Histogram', xlabel the value of the :key option and ylabel 'n'. When the
# :ylogscale option is set the y-axis label is wrapped as "log10(<label>)".
def defaults
  fallbacks = {
    terminal: :dumb,
    title:    'Histogram',
    xlabel:   @options[:key],
    ylabel:   'n'
  }

  fallbacks.each { |option, fallback| @options[option] ||= fallback }

  return unless @options[:ylogscale]

  @options[:ylabel] = "log10(#{@options[:ylabel]})"
end
|
|
177
|
+
|
|
178
|
+
# Process the input stream, collect all plot data, and output records.
#
# For each record holding the plot key, the bin named by that key's value is
# incremented: by 1 when no value key is configured, otherwise by the
# record's value for the value key. Records without the plot key are not
# counted but are still passed on to the output.
#
# @param input  [Enumerator]          Input stream.
# @param output [Enumerator::Yielder] Output stream.
#
# @raise [RuntimeError] If a value key is configured and a record with the
#   plot key lacks it.
def process_input(input, output)
  input.each do |record|
    @status[:records_in] += 1

    bin = record[@key]

    if bin
      weight = @value ? record[@value] : 1
      fail "value: #{@value} not found in record: #{record}" unless weight

      @count_hash[bin] += weight
    end

    process_output(output, record)
  end
end
|
|
201
|
+
|
|
202
|
+
# Output record to the output stream if such is defined.
#
# Nothing happens when output is nil/false; otherwise the record is appended
# and the records_out counter is incremented.
#
# @param output [Enumerator::Yielder] Output stream.
# @param record [Hash]                BioDSL record.
def process_output(output, record)
  if output
    output << record
    @status[:records_out] += 1
  end
end
|
|
211
|
+
|
|
212
|
+
# Create a Gnuplot using the collected data from the input stream.
#
# A new GnuPlotter is configured with the default settings and the y-axis
# scale. The dataset is then added by one of three methods: plot_empty when
# no data was collected, plot_numeric when the first collected key is Numeric
# and plot_string otherwise. Finally the xtics are adjusted.
def plot_create
  @gp = GnuPlotter.new
  plot_defaults
  plot_fix_ylogscale

  renderer =
    if @count_hash.empty?
      :plot_empty
    elsif @count_hash.keys.first.is_a?(Numeric)
      :plot_numeric
    else
      :plot_string
    end

  send(renderer)

  plot_fix_xtics
end
|
|
228
|
+
|
|
229
|
+
# Set the default values for the plot.
#
# Terminal, title and axis labels are taken from the options hash; the
# remaining settings are fixed. Settings are applied in the listed order.
def plot_defaults
  settings = {
    terminal:  @options[:terminal].to_s,
    title:     @options[:title],
    xlabel:    @options[:xlabel],
    ylabel:    @options[:ylabel],
    autoscale: 'xfix',
    style:     'fill solid 0.5 border',
    xtics:     'out',
    ytics:     'out'
  }

  settings.map { |name, value| @gp.set(name => value) }.last
end
|
|
240
|
+
|
|
241
|
+
# Set plot values accordingly if the ylogscale flag is set.
#
# With :ylogscale the y-axis is set to log scale and the yrange to '[1:*]';
# without it the yrange is set to '[0:*]'.
def plot_fix_ylogscale
  return @gp.set(yrange: '[0:*]') unless @options[:ylogscale]

  @gp.set(logscale: 'y')
  @gp.set(yrange: '[1:*]')
end
|
|
250
|
+
|
|
251
|
+
# Set plot values to create an empty plot if no plot data was collected.
#
# The yrange is fixed to '[-1:1]', the key is switched off and both the x and
# y tics are removed.
def plot_empty
  { yrange: '[-1:1]', key: 'off' }.each do |name, value|
    @gp.set(name => value)
  end

  %i(xtics ytics).map { |axis| @gp.unset(axis => true) }.last
end
|
|
258
|
+
|
|
259
|
+
# If plot data have numeric xtic values use numeric xtic labels.
#
# One [x, count] row is added for every integer x from 0 up to the largest
# collected key; values of x that were never seen are read from the count
# table as-is (0 when the table defaults to 0, as set up in the constructor).
#
# NOTE(review): the range starts at 0, so keys below 0 are not emitted, and
# only integer values of x are looked up - confirm this is acceptable for the
# data plotted.
def plot_numeric
  upper = @count_hash.each_key.max || 0

  @gp.add_dataset(using: '1:2', with: 'boxes notitle') do |dataset|
    (0..upper).each do |bin|
      dataset << [bin, @count_hash[bin]]
    end
  end
end
|
|
267
|
+
|
|
268
|
+
# If plot data have string xtic values use these as xtic labels.
#
# The xtics are rotated first when needed, then one [label, count] row is
# added per collected key, in the order the keys were collected.
def plot_string
  plot_xtics_rotate

  @gp.add_dataset(using: '2:xticlabels(1)',
                  with:  'boxes notitle lc rgb "red"') do |dataset|
    @count_hash.each_pair { |label, count| dataset << [label, count] }
  end
end
|
|
277
|
+
|
|
278
|
+
# If xtic labels are longer than 2, rotate these.
#
# Only the first collected label is inspected. When its size exceeds 2 the
# xtics are rotated and the x-axis label is blanked.
def plot_xtics_rotate
  first_label, = @count_hash.first

  if first_label.size > 2
    @gp.set(xtics: 'rotate')
    @gp.set(xlabel: '')
  end
end
|
|
284
|
+
|
|
285
|
+
# Determine if xtics should be plotted and unset these if not. Don't plot
# xtics if there are more than 50 labels.
#
# Labels are used for every non-Numeric key (plot_create routes all such keys
# to plot_string), so the check mirrors that rule instead of requiring the
# key class to be exactly String; Symbol keys and String subclasses are
# therefore handled as labels too. Numeric keys and plots with 50 or fewer
# bins keep their xtics.
def plot_fix_xtics
  return if @count_hash.size <= 50
  return if @count_hash.keys.first.is_a?(Numeric)

  @gp.unset xtics: true
end
|
|
292
|
+
|
|
293
|
+
# Output plot data.
#
# The output file is set on the plot when the :output option is given. With
# the :test option the gnuplot script is written to $stderr instead of
# plotting. With the :dumb terminal the result of plot is printed to stdout.
# For any other terminal plot is simply run.
def plot_output
  target = @options[:output]
  @gp.set(output: target) if target

  return $stderr.puts(@gp.to_gp) if @options[:test]
  return puts(@gp.plot) if @options[:terminal] == :dumb

  @gp.plot
end
|
|
305
|
+
end
|
|
306
|
+
end
|