BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
# rubocop:disable LineLength
|
|
29
|
+
module BioDSL
|
|
30
|
+
# == Plot matches from the stream as a dotplot.
|
|
31
|
+
#
|
|
32
|
+
# +plot_matches+ is used to create dotplots of matches in the stream.
|
|
33
|
+
# plot_matches uses Q_BEG, Q_END, S_BEG, S_END from the stream. If strand
|
|
34
|
+
# information is available either by a STRAND key with the value '+' or '-',
|
|
35
|
+
# or by a DIRECTION key with the value 'forward' or 'reverse' then forward
|
|
36
|
+
# matches will be output in green and reverse matches in red (in all
|
|
37
|
+
# terminals except +dumb+).
|
|
38
|
+
#
|
|
39
|
+
# Default graphics are crufty ASCII and you probably want high resolution
|
|
40
|
+
# postscript or SVG output instead, which is easy using the +terminal+ option.
|
|
41
|
+
# Plotting is done using GNUplot which allows for different types of output.
|
|
42
|
+
#
|
|
43
|
+
# GNUplot must be installed for plot_matches to work. Read more here:
|
|
44
|
+
#
|
|
45
|
+
# http://www.gnuplot.info/
|
|
46
|
+
#
|
|
47
|
+
# == Usage
|
|
48
|
+
#
|
|
49
|
+
# plot_matches([direction: <string>[, output: <file>[, force: <bool>
|
|
50
|
+
# [, terminal: <string>[, title: <string>[, xlabel: <string>
|
|
51
|
+
# [, ylabel: <string>[, test: <bool>]]]]]]]])
|
|
52
|
+
#
|
|
53
|
+
# === Options
|
|
54
|
+
#
|
|
55
|
+
# * direction: <string> - Plot matches from forward|reverse|both direction(s)
|
|
56
|
+
# (default=both).
|
|
57
|
+
# * output: <file> - Output file.
|
|
58
|
+
# * force: <bool> - Force overwrite existing output file.
|
|
59
|
+
# * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
|
|
60
|
+
# (default=dumb).
|
|
61
|
+
# * title: <string> - Plot title (default="Matches").
|
|
62
|
+
# * xlabel: <string> - X-axis label (default="x").
|
|
63
|
+
# * ylabel: <string> - Y-axis label (default="y").
|
|
64
|
+
# * test: <bool> - Output Gnuplot script instead of plot.
|
|
65
|
+
#
|
|
66
|
+
# == Examples
|
|
67
|
+
#
|
|
68
|
+
# Here we plot two matches from a table. The vector records are shown in the
|
|
69
|
+
# +dump+ output:
|
|
70
|
+
#
|
|
71
|
+
# BP.new.read_table(input: "test.tab").dump.plot_matches.run
|
|
72
|
+
#
|
|
73
|
+
# {:Q_BEG=>0, :Q_END=>10, :S_BEG=>0, :S_END=>10, :STRAND=>"+"}
|
|
74
|
+
# {:Q_BEG=>0, :Q_END=>10, :S_BEG=>0, :S_END=>10, :STRAND=>"-"}
|
|
75
|
+
#
|
|
76
|
+
# Matches
|
|
77
|
+
# + + + + + +
|
|
78
|
+
# 10 +>>>-----------+-------------+------------+-------------+----------->>>+
|
|
79
|
+
# | >>>> : : : : >>>> |
|
|
80
|
+
# | >>>> : : : : >>>> |
|
|
81
|
+
# 8 ++..........>>>>>......................................>>>>>..........++
|
|
82
|
+
# | : >>>> : : >>>> : |
|
|
83
|
+
# | : >>>> : : >>>> : |
|
|
84
|
+
# 6 ++.......................>>>>>............>>>>>.......................++
|
|
85
|
+
# | : :>>>> >>>>: : |
|
|
86
|
+
# | : : >>>> : : |
|
|
87
|
+
# | : :>>>> >>>>: : |
|
|
88
|
+
# 4 ++.......................>>>>>............>>>>>.......................++
|
|
89
|
+
# | : >>>> : : >>>> : |
|
|
90
|
+
# | : >>>> : : >>>> : |
|
|
91
|
+
# 2 ++..........>>>>>......................................>>>>>..........++
|
|
92
|
+
# | >>>> : : : : >>>> |
|
|
93
|
+
# | >>>> : : : : >>>> |
|
|
94
|
+
# 0 +>>>-----------+-------------+------------+-------------+----------->>>+
|
|
95
|
+
# + + + + + +
|
|
96
|
+
# 0 2 4 6 8 10
|
|
97
|
+
# x
|
|
98
|
+
#
|
|
99
|
+
# To render X11 output (i.e. instant view) use the +terminal+ option:
|
|
100
|
+
#
|
|
101
|
+
# plot_matches(terminal: :x11).run
|
|
102
|
+
#
|
|
103
|
+
# To generate a PNG image and save to file:
|
|
104
|
+
#
|
|
105
|
+
# plot_matches(terminal: :png, output: "plot.png").run
|
|
106
|
+
#
|
|
107
|
+
# rubocop:disable ClassLength
|
|
108
|
+
# rubocop:enable LineLength
|
|
109
|
+
class PlotMatches
|
|
110
|
+
require 'gnuplotter'
|
|
111
|
+
require 'BioDSL/helpers/aux_helper'
|
|
112
|
+
|
|
113
|
+
include AuxHelper
|
|
114
|
+
|
|
115
|
+
STATS = %i(records_in records_out matches_in)
|
|
116
|
+
|
|
117
|
+
# Constructor for PlotMatches.
#
# Stores the options, prepares the two dataset styles (line style 1 and
# line style 2, both drawn as headless vectors from four data columns) and
# then runs the gnuplot auxiliary check, the option validation and the
# option defaults, in that order.
#
# @param options [Hash] Options hash.
# @option options [Symbol]  :direction
# @option options [String]  :output
# @option options [Boolean] :force
# @option options [Symbol]  :terminal
# @option options [String]  :title
# @option options [String]  :xlabel
# @option options [String]  :ylabel
# @option options [Boolean] :test
#
# @return [PlotMatches] Class instance.
def initialize(options)
  @options = options
  @gp      = nil # The GnuPlotter instance is created lazily in lmb.

  @style1, @style2 = [1, 2].map do |line_style|
    {using: '1:2:3:4', with: "vectors nohead ls #{line_style}"}
  end

  # NOTE(review): aux_exist presumably verifies that the gnuplot executable
  # can be found - confirm in AuxHelper.
  aux_exist('gnuplot')
  check_options
  defaults
end
|
|
140
|
+
|
|
141
|
+
# Return the lambda implementing the plot_matches command.
#
# When called, the lambda initialises the status counters, creates a
# GnuPlotter configured with the plot defaults, and - with one dataset open
# per style - passes every input record to plot_match and process_output.
# After the stream is exhausted the plot is emitted through plot_output.
#
# @return [Proc] Command lambda taking +input+, +output+ and +status+.
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    @gp = GnuPlotter.new
    plot_defaults

    # @style1 receives the forward matches, @style2 the reverse matches.
    @gp.add_dataset(@style1) do |fwd|
      @gp.add_dataset(@style2) do |rev|
        input.each do |rec|
          @status[:records_in] += 1

          plot_match(fwd, rev, rec)
          process_output(output, rec)
        end
      end
    end

    plot_output
  end
end
|
|
166
|
+
|
|
167
|
+
private
|
|
168
|
+
|
|
169
|
+
# Check options.
#
# Passes the options hash through the option helpers: the permitted keys,
# the permitted values for :direction, :terminal and :test, and finally the
# :output file check.
# NOTE(review): options_files_exist_force presumably rejects an existing
# :output file unless :force is set - confirm in the options helper.
def check_options
  allowed_keys = %i(direction output force terminal title xlabel ylabel test)

  options_allowed(@options, *allowed_keys)
  options_allowed_values(@options, direction: %i(forward reverse both))
  options_allowed_values(@options,
                         terminal: %i(dumb post svg x11 aqua png pdf))
  options_allowed_values(@options, test: [nil, true, false])
  options_files_exist_force(@options, :output)
end
|
|
179
|
+
|
|
180
|
+
# Set default options.
#
# Every option that is missing (or falsy) in the options hash is given its
# default value; options that are already set are left untouched.
def defaults
  fallback = {
    direction: :both,
    terminal:  :dumb,
    title:     'Matches',
    xlabel:    'x',
    ylabel:    'y'
  }

  fallback.each { |key, value| @options[key] ||= value }
end
|
|
188
|
+
|
|
189
|
+
# Set plot default attributes.
#
# Sends the terminal, title and axis labels taken from the options to the
# plotter, followed by the fixed plot settings and the two line styles:
# line style 1 is green and line style 2 is red.
def plot_defaults
  settings = [
    [:terminal,  @options[:terminal].to_s],
    [:title,     @options[:title]],
    [:xlabel,    @options[:xlabel]],
    [:ylabel,    @options[:ylabel]],
    [:autoscale, 'xfix'],
    [:autoscale, 'yfix'],
    [:style,     'fill solid 0.5 border'],
    [:xtics,     'border out'],
    [:ytics,     'border out'],
    [:grid,      :true],
    [:nokey,     :true]
  ]

  {1 => 'green', 2 => 'red'}.each do |index, color|
    settings << [:style,
                 "line #{index} linetype 1 linecolor rgb \"#{color}\" " \
                 'linewidth 2 pointtype 6 pointsize default']
  end

  # The settings are applied one by one, in the order listed above.
  settings.each { |name, value| @gp.set(name => value) }
end
|
|
207
|
+
|
|
208
|
+
# Add match data to forward or reverse dataset.
#
# Records lacking any of the keys Q_BEG, Q_END, S_BEG or S_END are ignored.
# All other records are counted in the +matches_in+ status.
#
# The orientation of a match is read from the STRAND key ('+' is forward)
# or, if STRAND is absent, from the DIRECTION key ('forward' is forward).
# A record carrying both keys is therefore plotted once only, with STRAND
# taking precedence. Records with neither key are counted but not plotted.
#
# The +direction+ option is honoured here: with :forward only forward
# matches are plotted, with :reverse only reverse matches, and with :both
# (or when the option is unset) every match is plotted.
#
# @param forward [GnuPlotter::DataSet] Forward matches.
# @param reverse [GnuPlotter::DataSet] Reverse matches.
# @param record  [Hash]                BioDSL record.
def plot_match(forward, reverse, record)
  return unless record[:Q_BEG] && record[:Q_END] &&
                record[:S_BEG] && record[:S_END]

  @status[:matches_in] += 1

  use_strand = record[:STRAND] ? true : false
  return unless use_strand || record[:DIRECTION]

  is_forward = if use_strand
                 record[:STRAND] == '+'
               else
                 record[:DIRECTION] == 'forward'
               end

  # Skip matches whose orientation is excluded by the direction option.
  wanted = @options[:direction]
  return if wanted == :forward && !is_forward
  return if wanted == :reverse && is_forward

  q_len = record[:Q_END] - record[:Q_BEG]
  s_len = record[:S_END] - record[:S_BEG]

  # Dispatch to exactly one helper so a record is never added twice.
  if use_strand
    plot_match_strand(forward, reverse, record, q_len, s_len)
  else
    plot_match_direction(forward, reverse, record, q_len, s_len)
  end
end
|
|
224
|
+
|
|
225
|
+
# Add match data to forward or reverse dataset depending on match strand.
#
# Nothing happens when the record has no STRAND key. A '+' strand adds a
# vector starting at (Q_BEG, S_BEG) to the forward dataset; any other
# strand adds a vector starting at (Q_END, S_BEG) with a negated query
# length to the reverse dataset.
#
# @param forward [GnuPlotter::DataSet] Forward matches.
# @param reverse [GnuPlotter::DataSet] Reverse matches.
# @param record  [Hash]                BioDSL record.
# @param q_len   [Integer]             Length of query match.
# @param s_len   [Integer]             Length of subject match.
def plot_match_strand(forward, reverse, record, q_len, s_len)
  strand = record[:STRAND]
  return unless strand

  s_beg = record[:S_BEG]
  return forward << [record[:Q_BEG], s_beg, q_len, s_len] if strand == '+'

  reverse << [record[:Q_END], s_beg, -q_len, s_len]
end
|
|
241
|
+
|
|
242
|
+
# Add match data to forward or reverse dataset depending on match direction.
#
# Nothing happens when the record has no DIRECTION key. A 'forward'
# direction adds a vector starting at (Q_BEG, S_BEG) to the forward
# dataset; any other direction adds a vector starting at (Q_END, S_BEG)
# with a negated query length to the reverse dataset.
#
# @param forward [GnuPlotter::DataSet] Forward matches.
# @param reverse [GnuPlotter::DataSet] Reverse matches.
# @param record  [Hash]                BioDSL record.
# @param q_len   [Integer]             Length of query match.
# @param s_len   [Integer]             Length of subject match.
def plot_match_direction(forward, reverse, record, q_len, s_len)
  direction = record[:DIRECTION]
  return unless direction

  s_beg = record[:S_BEG]
  return forward << [record[:Q_BEG], s_beg, q_len, s_len] if direction == 'forward'

  reverse << [record[:Q_END], s_beg, -q_len, s_len]
end
|
|
258
|
+
|
|
259
|
+
# Output plot data.
#
# Sets the plotter output file when the :output option is given. With the
# :test option the Gnuplot script is written to stderr instead of
# plotting. Otherwise the plot is produced; for the :dumb terminal the
# result of the plot call is additionally printed to stdout.
def plot_output
  out_file = @options[:output]
  @gp.set(output: out_file) if out_file

  return $stderr.puts(@gp.to_gp) if @options[:test]
  return puts(@gp.plot) if @options[:terminal] == :dumb

  @gp.plot
end
|
|
271
|
+
|
|
272
|
+
# Emit record to output stream if defined.
#
# When an output stream is given the record is appended to it and the
# +records_out+ status is incremented; without an output stream nothing
# happens.
#
# @param output [Enumerator::Yielder] Output stream.
# @param record [Hash]                BioDSL record.
def process_output(output, record)
  if output
    output << record
    @status[:records_out] += 1
  end
end
|
|
281
|
+
end
|
|
282
|
+
end
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Plot the residue distribution of sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +plot_residue_distribution+ creates a residue distribution plot per sequence
|
|
32
|
+
# position of sequences in the stream. Plotting is done using GNUplot which
|
|
33
|
+
# allows for different types of output the default one being crufty ASCII
|
|
34
|
+
# graphics.
|
|
35
|
+
#
|
|
36
|
+
# If plotting distributions from sequences of variable length you can use the
|
|
37
|
+
# +count+ option to co-plot the relative count at each base position. This
|
|
38
|
+
# allows you to explain areas with a skewed distribution.
|
|
39
|
+
#
|
|
40
|
+
# GNUplot must be installed for +plot_residue_distribution+ to work. Read more
|
|
41
|
+
# here:
|
|
42
|
+
#
|
|
43
|
+
# http://www.gnuplot.info/
|
|
44
|
+
#
|
|
45
|
+
# == Usage
|
|
46
|
+
#
|
|
47
|
+
# plot_residue_distribution([count: <bool>[, output: <file>
|
|
48
|
+
# [, force: <bool> [, terminal: <string>
|
|
49
|
+
# [, title: <string>[, xlabel: <string>
|
|
50
|
+
# [, ylabel: <string>[, test: <bool>]]]]]]])
|
|
51
|
+
#
|
|
52
|
+
# === Options
|
|
53
|
+
#
|
|
54
|
+
# * count: <bool> - Plot relative count (default=false).
|
|
55
|
+
# * output: <file> - Output file.
|
|
56
|
+
# * force: <bool> - Force overwrite existing output file.
|
|
57
|
+
# * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
|
|
58
|
+
# (default=dumb).
|
|
59
|
+
# * title: <string> - Plot title (default="Residue Distribution").
|
|
60
|
+
# * xlabel: <string> - X-axis label (default="Sequence position").
|
|
61
|
+
# * ylabel: <string> - Y-axis label (default="%").
|
|
62
|
+
# * test: <bool> - Output Gnuplot script instead of plot.
|
|
63
|
+
#
|
|
64
|
+
# == Examples
|
|
65
|
+
#
|
|
66
|
+
# Here we plot a residue distribution of a FASTA file:
|
|
67
|
+
#
|
|
68
|
+
# BP.new.read_fasta(input: "test.fna").plot_residue_distribution.run
|
|
69
|
+
#
|
|
70
|
+
# rubocop: disable ClassLength
|
|
71
|
+
class PlotResidueDistribution
|
|
72
|
+
require 'gnuplotter'
|
|
73
|
+
require 'set'
|
|
74
|
+
require 'BioDSL/helpers/aux_helper'
|
|
75
|
+
|
|
76
|
+
include AuxHelper
|
|
77
|
+
|
|
78
|
+
STATS = %i(records_in records_out sequences_in sequences_out residues_in
|
|
79
|
+
residues_out)
|
|
80
|
+
|
|
81
|
+
# Constructor for PlotResidueDistribution.
#
# Stores the options and sets up the empty bookkeeping containers: residue
# counts per sequence position, total counts per position, the set of
# residues seen, and the set of datasets that have already been offset.
# Afterwards the gnuplot auxiliary check, the option validation and the
# option defaults are run, in that order.
#
# @param options [Hash] Options hash.
# @option options [Boolean] :count
# @option options [String]  :output
# @option options [Boolean] :force
# @option options [Symbol]  :terminal
# @option options [String]  :title
# @option options [String]  :xlabel
# @option options [String]  :ylabel
# @option options [Boolean] :test
#
# @return [PlotResidueDistribution] Class instance.
def initialize(options)
  @options  = options
  @counts   = Hash.new { |by_pos, pos| by_pos[pos] = Hash.new(0) }
  @total    = Hash.new(0)
  @residues = Set.new
  @gp       = nil
  @offset   = Set.new # Hackery thing to offset datasets 1 position.

  # NOTE(review): aux_exist presumably verifies that the gnuplot executable
  # can be found - confirm in AuxHelper.
  aux_exist('gnuplot')
  check_options
  defaults
end
|
|
106
|
+
|
|
107
|
+
# Return command lambda for PlotResidueDistribution.
#
# When called, the lambda initialises the status counters and walks the
# input stream: records with a SEQ key have their residues counted, and -
# if an output stream is present - every record is passed on while the
# output statistics are updated. Once the stream is exhausted the plot is
# created and emitted.
#
# @return [Proc] Command lambda taking +input+, +output+ and +status+.
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    input.each do |rec|
      @status[:records_in] += 1

      count_residues(rec) if rec.key?(:SEQ)

      if output
        output << rec
        @status[:records_out] += 1

        if rec.key?(:SEQ)
          @status[:sequences_out] += 1
          @status[:residues_out]  += rec[:SEQ].length
        end
      end
    end

    plot_create
    plot_output
  end
end
|
|
133
|
+
|
|
134
|
+
private
|
|
135
|
+
|
|
136
|
+
# Check options.
#
# Passes the options hash through the option helpers: the permitted keys,
# the permitted values for :terminal, :count and :test, and finally the
# :output file check.
# NOTE(review): options_files_exist_force presumably rejects an existing
# :output file unless :force is set - confirm in the options helper.
def check_options
  allowed_keys = %i(count output force terminal title xlabel ylabel test)

  options_allowed(@options, *allowed_keys)
  options_allowed_values(@options,
                         terminal: %i(dumb post svg x11 aqua png pdf))

  [:count, :test].each do |flag|
    options_allowed_values(@options, flag => [nil, true, false])
  end

  options_files_exist_force(@options, :output)
end
|
|
146
|
+
|
|
147
|
+
# Set default options.
#
# Every option that is missing (or falsy) in the options hash is given its
# default value; options that are already set are left untouched.
def defaults
  fallback = {
    terminal: :dumb,
    title:    'Residue Distribution',
    xlabel:   'Sequence position',
    ylabel:   '%'
  }

  fallback.each { |key, value| @options[key] ||= value }
end
|
|
154
|
+
|
|
155
|
+
# Given a record with a sequence count its residues.
#
# Updates the +sequences_in+ and +residues_in+ status and then, for each
# position of the upcased sequence, increments the count of the residue at
# that position, increments the total for that position and remembers the
# residue in the set of residues seen.
#
# @param record [Hash] BioDSL record
def count_residues(record)
  seq = record[:SEQ]

  @status[:sequences_in] += 1
  @status[:residues_in]  += seq.length

  seq.upcase.chars.each_with_index do |residue, pos|
    key = residue.to_sym

    @counts[pos][key] += 1
    @total[pos]       += 1
    @residues << key
  end
end
|
|
169
|
+
|
|
170
|
+
# Create plot.
#
# Instantiates the plotter, applies the plot defaults and adds one dataset
# per residue seen, in reverse sorted residue order. The running index is
# handed to plot_residue together with the residue. With the :count option
# the count dataset is added last.
def plot_create
  @gp = GnuPlotter.new
  plot_defaults

  ordered = @residues.sort.reverse
  ordered.each_index { |idx| plot_residue(ordered[idx], idx) }

  plot_count if @options[:count]
end
|
|
181
|
+
|
|
182
|
+
# Plot residue data.
#
# Adds a histogram dataset titled with the residue, using line type
# +i + 1+. For every sequence position the percentage of the residue at
# that position is appended. The first time a residue is plotted a single
# leading 0.0 is inserted to offset the dataset by one position.
#
# @param residue [Symbol]  Residue to plot.
# @param i       [Integer] Zero-based index of the residue dataset.
def plot_residue(residue, i)
  style = {using: 1, with: "histogram lt #{i + 1}", title: "\"#{residue}\""}

  @gp.add_dataset(style) do |plotter|
    @counts.each do |pos, dist|
      # Set#add? is truthy only when the residue was not yet offset.
      plotter << 0.0 if @offset.add?(residue)
      plotter << 100 * dist[residue].to_f / @total[pos]
    end
  end
end
|
|
193
|
+
|
|
194
|
+
# Plot count data.
#
# Adds a black line dataset titled "count" holding, for each sequence
# position, the total at that position as a percentage of the largest
# total. The first time the count is plotted a leading [0, 0.0] point is
# inserted.
def plot_count
  peak       = @total.values.max
  line_style = {using: '1:2', with: 'lines lw 2 lt rgb "black"',
                title: '"count"'}

  @gp.add_dataset(line_style) do |plotter|
    @counts.each_key do |pos|
      # Set#add? is truthy only when the count was not yet offset.
      plotter << [0, 0.0] if @offset.add?(:count)
      plotter << [pos, 100 * @total[pos].to_f / peak]
    end
  end
end
|
|
208
|
+
|
|
209
|
+
# Set plot defaults
#
# Sends the terminal, title, axis labels and - when given - the output
# file from the options to the plotter, followed by the fixed histogram
# settings. The x range spans from 0 to the number of sequence positions
# counted. The line colors are set last via plot_colors.
#
# rubocop: disable MethodLength
def plot_defaults
  settings = [
    [:terminal, @options[:terminal].to_s],
    [:title,    @options[:title]],
    [:xlabel,   @options[:xlabel]],
    [:ylabel,   @options[:ylabel]]
  ]

  settings << [:output, @options[:output]] if @options[:output]

  settings.push(
    [:xtics,    'out'],
    [:ytics,    'out'],
    [:yrange,   '[0:100]'],
    [:xrange,   "[0:#{@counts.size}]"],
    [:auto,     'fix'],
    [:offsets,  '1'],
    [:key,      'outside right top vertical Left reverse noenhanced ' \
                'autotitles columnhead nobox'],
    [:key,      'invert samplen 4 spacing 1 width 0 height 0'],
    [:style,    'fill solid 0.5 border'],
    [:style,    'histogram rowstacked'],
    [:style,    'data histograms'],
    [:boxwidth, '0.75 absolute']
  )

  # The settings are applied one by one, in the order listed above.
  settings.each { |name, value| @gp.set(name => value) }

  plot_colors
end
|
|
234
|
+
|
|
235
|
+
# Set plot line colors
#
# Defines line types 1 to 26 from the palette below, in order, and then
# sets the line type cycle to the palette size.
#
# color scheme: http://en.wikipedia.org/wiki/Help:Distinguishable_colors
def plot_colors
  palette = [
    '#FF0010', # Red
    '#191919', # Ebony
    '#0075DC', # Blue
    '#2BCE48', # Green
    '#FFFF00', # Yellow
    '#4C005C', # Damson
    '#993F00', # Caramel
    '#FFCC99', # Honeydew
    '#808080', # Iron
    '#94FFB5', # Jade
    '#8F7C00', # Khaki
    '#9DCC00', # Lime
    '#C20088', # Mallow
    '#003380', # Navy
    '#FFA405', # Orpiment
    '#FFA8BB', # Pink
    '#426600', # Quagmire
    '#F0A3FF', # Amethyst
    '#5EF1F2', # Sky
    '#00998F', # Turquoise
    '#E0FF66', # Uranium
    '#740AFF', # Violet
    '#990000', # Wine
    '#FFFF80', # Xanthin
    '#005C31', # Forest
    '#FF5005'  # Zinnia
  ]

  palette.each_with_index do |hex, idx|
    @gp.set(linetype: "#{idx + 1} lc rgb \"#{hex}\"")
  end

  @gp.set(linetype: "cycle #{palette.size}")
end
|
|
266
|
+
|
|
267
|
+
# Output plot data.
#
# With the :test option the Gnuplot script is written to stderr instead of
# plotting. Otherwise the plot is produced; for the :dumb terminal the
# result of the plot call is additionally printed to stdout.
def plot_output
  return $stderr.puts(@gp.to_gp) if @options[:test]
  return puts(@gp.plot) if @options[:terminal] == :dumb

  @gp.plot
end
|
|
277
|
+
end
|
|
278
|
+
end
|