BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
# rubocop: disable LineLength
|
|
29
|
+
module BioDSL
|
|
30
|
+
# == Create a histogram with mean sequence quality scores.
|
|
31
|
+
#
|
|
32
|
+
# +plot_scores+ creates a histogram of the mean values per base of the quality
|
|
33
|
+
# scores from sequence data.
|
|
34
|
+
#
|
|
35
|
+
# Plotting is done using GNUplot which allows for different types of output
|
|
36
|
+
# the default one being crufty ASCII graphics.
|
|
37
|
+
#
|
|
38
|
+
# If plotting scores from sequences of variable length you can use the +count+
|
|
39
|
+
# option to co-plot the relative count at each base position. This allows you
|
|
40
|
+
# to detect areas with a low relative count showing a high mean score.
|
|
41
|
+
#
|
|
42
|
+
# GNUplot must be installed for plot_scores to work. Read more here:
|
|
43
|
+
#
|
|
44
|
+
# http://www.gnuplot.info/
|
|
45
|
+
#
|
|
46
|
+
# == Usage
|
|
47
|
+
#
|
|
48
|
+
# plot_scores([count: <bool>[, output: <file>[, force: <bool>
|
|
49
|
+
# [, terminal: <string>[, title: <string>
|
|
50
|
+
# [, xlabel: <string>[, ylabel: <string>
|
|
51
|
+
# [, test: <bool>]]]]]]]])
|
|
52
|
+
#
|
|
53
|
+
# === Options
|
|
54
|
+
#
|
|
55
|
+
# * count: <bool> - Add line plot of relative counts.
|
|
56
|
+
# * output: <file> - Output file.
|
|
57
|
+
# * force: <bool> - Force overwrite existing output file.
|
|
58
|
+
# * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
|
|
59
|
+
# (default=dumb).
|
|
60
|
+
# * title: <string> - Plot title (default="Mean Quality Scores").
|
|
61
|
+
# * xlabel: <string> - X-axis label (default="Sequence Position").
|
|
62
|
+
# * ylabel: <string> - Y-axis label (default="Mean Score").
|
|
63
|
+
# * test: <bool> - Output Gnuplot script instead of plot.
|
|
64
|
+
#
|
|
65
|
+
# == Examples
|
|
66
|
+
#
|
|
67
|
+
# Here we plot the mean quality scores from a FASTQ file:
|
|
68
|
+
#
|
|
69
|
+
# read_fastq(input: "test.fq").plot_scores.run
|
|
70
|
+
#
|
|
71
|
+
# Mean Quality Scores
|
|
72
|
+
# + + + + + +
|
|
73
|
+
# 40 ++-------------+------------+-------------+-------------+------------+++
|
|
74
|
+
# | ***************** mean score ****** |
|
|
75
|
+
# 35 ++ *********************** ++
|
|
76
|
+
# ****************************** ** |
|
|
77
|
+
# 30 +********************************* * ++
|
|
78
|
+
# ************************************* * |
|
|
79
|
+
# 25 +*************************************** * ++
|
|
80
|
+
# ****************************************** ***** |
|
|
81
|
+
# 20 +**************************************************** ** * * ++
|
|
82
|
+
# ******************************************************************** *
|
|
83
|
+
# 15 +**********************************************************************+
|
|
84
|
+
# **********************************************************************
|
|
85
|
+
# 10 +**********************************************************************+
|
|
86
|
+
# **********************************************************************
|
|
87
|
+
# 5 +**********************************************************************+
|
|
88
|
+
# **********************************************************************
|
|
89
|
+
# 0 +**********************************************************************+
|
|
90
|
+
# + + + + + +
|
|
91
|
+
# 0 50 100 150 200 250
|
|
92
|
+
# Sequence position
|
|
93
|
+
#
|
|
94
|
+
# To render X11 output (i.e. instant view) use the +terminal+ option:
|
|
95
|
+
#
|
|
96
|
+
# read_fastq(input: "test.fq").
|
|
97
|
+
# plot_scores(terminal: :x11).run
|
|
98
|
+
#
|
|
99
|
+
# To generate a PNG image and save to file:
|
|
100
|
+
#
|
|
101
|
+
# read_fastq(input: "test.fq").
|
|
102
|
+
# plot_scores(terminal: :png, output: "plot.png").run
|
|
103
|
+
#
|
|
104
|
+
# rubocop: enable LineLength
|
|
105
|
+
# rubocop: disable ClassLength
|
|
106
|
+
class PlotScores
|
|
107
|
+
require 'gnuplotter'
|
|
108
|
+
require 'narray'
|
|
109
|
+
require 'BioDSL/helpers/aux_helper'
|
|
110
|
+
|
|
111
|
+
include AuxHelper
|
|
112
|
+
|
|
113
|
+
STATS = %i(records_in records_out sequences_in sequences_out residues_in
|
|
114
|
+
residues_out)
|
|
115
|
+
|
|
116
|
+
SCORES_MAX = 100_000 # Maximum score string length.
|
|
117
|
+
|
|
118
|
+
# Build a PlotScores command instance.
#
# Allocates two integer vectors of SCORES_MAX elements: one accumulates the
# per-position score sums, the other the per-position record counts. Options
# are then validated and defaults filled in.
#
# @param options [Hash] Options hash.
# @option options [Boolean] :count     Also plot the relative counts.
# @option options [String]  :output    Output file.
# @option options [Boolean] :force     Overwrite an existing output file.
# @option options [Symbol]  :terminal  Gnuplot terminal.
# @option options [String]  :title     Plot title.
# @option options [String]  :xlabel    X-axis label.
# @option options [String]  :ylabel    Y-axis label.
# @option options [Boolean] :ylogscale Accepted by check_options; not read
#   anywhere else in this class.
# @option options [Boolean] :test      Emit the Gnuplot script, not the plot.
#
# @return [PlotScores] Class instance.
def initialize(options)
  @options    = options
  @max        = 0 # longest score string seen so far
  @count_vec  = NArray.int(SCORES_MAX)
  @scores_vec = NArray.int(SCORES_MAX)

  # presumably verifies that the gnuplot executable is available -
  # helper is defined in AuxHelper, confirm there.
  aux_exist('gnuplot')
  check_options
  default
end
|
|
142
|
+
|
|
143
|
+
# Build the lambda that runs the plot_scores command.
#
# The lambda collects score data from every input record, forwards each
# record to the output stream and finally renders the plot.
#
# @return [Proc] Command lambda taking (input, output, status).
def lmb
  ->(input, output, status) do
    # status_init is defined outside this class; it is expected to make
    # @status available - confirm in the status helper.
    status_init(status, STATS)

    input.each do |rec|
      @status[:records_in] += 1
      collect_plot_data(rec)
      write_output(output, rec)
    end

    # Plot construction steps, in the order they must run.
    %i(prepare_plot_data plot_defaults plot_scores plot_count).each do |step|
      send(step)
    end

    plot_output
  end
end
|
|
166
|
+
|
|
167
|
+
private
|
|
168
|
+
|
|
169
|
+
# Validate the options hash using the shared option helpers (defined
# outside this class).
#
# Checks, in order: allowed keys, allowed values for :count, :test and
# :terminal, and the :output file against the :force option.
def check_options
  terminals = %i(dumb post svg x11 aqua png pdf)

  options_allowed(@options, *%i(count output force terminal title xlabel
                                ylabel ylogscale test))
  options_allowed_values(@options, count: [true, false])
  options_allowed_values(@options, test: [true, false])
  options_allowed_values(@options, terminal: terminals)
  options_files_exist_force(@options, :output)
end
|
|
179
|
+
|
|
180
|
+
# Fill in default values for any option that is unset (nil or false).
def default
  {
    terminal: :dumb,
    title:    'Mean Quality Scores',
    xlabel:   'Sequence Position',
    ylabel:   'Mean Score'
  }.each { |key, value| @options[key] ||= value }
end
|
|
187
|
+
|
|
188
|
+
# Accumulate the quality scores of a single record.
#
# Records without a :SCORES value, or with an empty one, are ignored.
# Otherwise each score byte, offset by Seq::SCORE_BASE, is added to the
# running per-position sums, the per-position counts are incremented and
# the longest score length seen so far is tracked in @max.
#
# @param record [Hash] BioDSL record.
#
# @raise [BioDSLError] if the score string exceeds SCORES_MAX.
def collect_plot_data(record)
  scores = record[:SCORES]
  return unless scores

  len = scores.length
  return if len.zero?

  check_length(scores)

  span = 0...len
  @scores_vec[span] += NArray.to_na(scores, 'byte') - Seq::SCORE_BASE
  @count_vec[span]  += 1

  @max = len if len > @max
end
|
|
203
|
+
|
|
204
|
+
# Guard against score strings longer than the preallocated vectors.
#
# @param scores [String] Score string (anything responding to #length).
#
# @raise [BioDSLError] if the length exceeds SCORES_MAX.
def check_length(scores)
  len = scores.length
  return if len <= SCORES_MAX

  raise BioDSLError, "score string too long: #{len} > #{SCORES_MAX}"
end
|
|
212
|
+
|
|
213
|
+
# Turn the accumulated vectors into plain arrays ready for plotting.
#
# Sets @x (1-based positions), @y1 (mean score per position) and @y2
# (counts rescaled so the largest count maps to Seq::SCORE_MAX).
def prepare_plot_data
  # Workaround kept from the original: with @max == 0 the slices below
  # would be empty, which the original author notes causes an index error.
  @max = 1 if @max.zero?

  span = 0...@max
  # NOTE(review): if no record carried scores the count maximum is
  # presumably 0 and this scale is not finite - confirm that is acceptable.
  scale = Seq::SCORE_MAX / @count_vec.max(0).to_f

  @x  = (1..@max).to_a
  @y1 = mean_vec.to_a
  @y2 = (@count_vec[span].to_f * scale).to_a
end
|
|
224
|
+
|
|
225
|
+
# Compute the mean score for each of the first @max positions by dividing
# the summed scores (as floats) by the per-position counts.
#
# @return [NArray] NArray with mean scores.
def mean_vec
  span   = 0...@max
  totals = @scores_vec[span].to_f
  totals / @count_vec[span]
end
|
|
231
|
+
|
|
232
|
+
# Create the GnuPlotter instance in @gp and apply the plot settings:
# terminal, title and axis labels from the options, the optional output
# file, axis ranges derived from the data, and fixed style/tics settings.
def plot_defaults
  @gp = GnuPlotter.new

  %i(terminal title xlabel ylabel).each do |key|
    @gp.set(key => @options[key])
  end

  @gp.set(output: @options[:output]) if @options[:output]

  # Pad the x-range by one position on either side of the data.
  x_lo = @x.min - 1
  x_hi = @x.max + 1
  @gp.set(xrange: "[#{x_lo}:#{x_hi}]")
  @gp.set(yrange: "[#{Seq::SCORE_MIN}:#{Seq::SCORE_MAX}]")
  @gp.set(style: 'fill solid 0.5 border')
  @gp.set(xtics: 'out')
  @gp.set(ytics: 'out')
end
|
|
246
|
+
|
|
247
|
+
# Add the mean score dataset (red boxes) to the plot, one
# [position, mean] pair per data point.
def plot_scores
  dataset_opts = {with: 'boxes lc rgb "red"', title: '"mean score"'}

  @gp.add_dataset(dataset_opts) do |plotter|
    @x.zip(@y1).each do |point|
      plotter << point
    end
  end
end
|
|
255
|
+
|
|
256
|
+
# Add the relative count dataset (black line) to the plot. Does nothing
# unless the :count option is set.
def plot_count
  if @options[:count]
    dataset_opts = {with: 'lines lt rgb "black"', title: '"relative count"'}

    @gp.add_dataset(dataset_opts) do |plotter|
      @x.zip(@y2).each do |point|
        plotter << point
      end
    end
  end
end
|
|
266
|
+
|
|
267
|
+
# Emit the plot.
#
# * With the :test option the Gnuplot script is written to $stderr.
# * With the :dumb terminal the rendered plot is printed to stdout.
# * Otherwise the plot is rendered without printing its return value.
def plot_output
  return $stderr.puts(@gp.to_gp) if @options[:test]
  return puts(@gp.plot) if @options[:terminal] == :dumb

  @gp.plot
end
|
|
277
|
+
|
|
278
|
+
# Forward a record to the output stream, if there is one, and count it
# in the :records_out status.
#
# @param output [#<<, nil] Output stream, or nil when nothing is downstream.
# @param record [Hash] BioDSL record.
def write_output(output, record)
  if output
    output << record
    @status[:records_out] += 1
  end
end
|
|
284
|
+
end
|
|
285
|
+
end
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Pick a number of random records from the stream.
|
|
30
|
+
#
|
|
31
|
+
# +random+ can be used to pick a random number of records from the stream.
|
|
32
|
+
# Note that the order of records is preserved.
|
|
33
|
+
#
|
|
34
|
+
# Using the `pairs: true` option allows random picking of interleaved
|
|
35
|
+
# paired-end sequence records.
|
|
36
|
+
#
|
|
37
|
+
# == Usage
|
|
38
|
+
#
|
|
39
|
+
# random(number: <uint>[, pairs: <bool>])
|
|
40
|
+
#
|
|
41
|
+
# === Options
|
|
42
|
+
#
|
|
43
|
+
# * number: <uint> - Number of records to pick.
|
|
44
|
+
# * pairs: <bool> - Preserve interleaved pair order.
|
|
45
|
+
#
|
|
46
|
+
# == Examples
|
|
47
|
+
#
|
|
48
|
+
# To pick some random records from the stream do:
|
|
49
|
+
#
|
|
50
|
+
# BP.new.
|
|
51
|
+
# read_fasta(input: "in.fna").
|
|
52
|
+
# random(number: 10_000).
|
|
53
|
+
# write_fasta(output: "out.fna").
|
|
54
|
+
# run
|
|
55
|
+
class Random
|
|
56
|
+
STATS = %i(records_in records_out)
|
|
57
|
+
|
|
58
|
+
# Constructor for Random.
#
# @param options [Hash] Options hash.
#
# @option options [Integer] :number Number of records to pick.
# @option options [Boolean] :pairs  Pick interleaved pairs.
#
# @return [Random] Class instance.
def initialize(options)
  @wanted  = nil # set of record indices to emit; decided after input is read
  @options = options

  check_options
end
|
|
72
|
+
|
|
73
|
+
# Build the lambda that runs the random command.
#
# All input records are first written to a temporary file, then the set
# of wanted record indices is decided, and finally the file is read back
# and only the wanted records are emitted.
#
# @return [Proc] Command lambda taking (input, output, status).
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    # TmpDir.create yields the temporary file path first; the second
    # block argument is unused here.
    TmpDir.create('random') do |tmp_file, _|
      process_input(input, tmp_file)
      decide_wanted
      process_output(output, tmp_file)
    end
  end
end
|
|
87
|
+
|
|
88
|
+
private
|
|
89
|
+
|
|
90
|
+
# Validate the options hash using the shared option helpers (defined
# outside this class): allowed keys, the required :number option, the
# allowed values for :pairs, and the ':number > 0' assertion.
def check_options
  opts = @options

  options_allowed(opts, :number, :pairs)
  options_required(opts, :number)
  options_allowed_values(opts, pairs: [nil, true, false])
  options_assert(opts, ':number > 0')
end
|
|
97
|
+
|
|
98
|
+
# Serialize every record from the input stream to a temporary file,
# counting each one in the :records_in status.
#
# @param input [Enumerator] Input stream.
# @param file  [String]     Path to temporary file.
def process_input(input, file)
  File.open(file, 'wb') do |ios|
    BioDSL::Serializer.new(ios) do |writer|
      input.each do |rec|
        @status[:records_in] += 1

        writer << rec
      end
    end
  end
end
|
|
113
|
+
|
|
114
|
+
# Decide which record indices to emit and store them in @wanted.
#
# With the :pairs option the choice is delegated to decide_wanted_pairs.
# Otherwise all record indices are shuffled and the first :number of them
# are kept as a Set.
def decide_wanted
  return decide_wanted_pairs if @options[:pairs]

  shuffled = [*0...@status[:records_in]].shuffle
  @wanted  = Set.new(shuffled[0...@options[:number]])
end
|
|
123
|
+
|
|
124
|
+
# Decide which interleaved record pairs to emit and store their indices
# in @wanted.
#
# A pair is the record at an even index i together with the record at
# i + 1. Half of :number (integer division) pairs are picked at random.
# Only even indices whose mate exists are candidates, so with an odd
# number of records the trailing unpaired record is never selected.
def decide_wanted_pairs
  pair_count  = @options[:number] / 2
  # Even start indices i for which i + 1 is still a valid record index.
  pair_starts = (0...@status[:records_in] - 1).step(2).to_a

  @wanted = Set.new
  pair_starts.sample(pair_count).each { |i| @wanted << i << i + 1 }
end
|
|
134
|
+
|
|
135
|
+
# Read the serialized records back from the temporary file and emit the
# ones whose index is in @wanted, counting each in :records_out.
#
# @param output [Enumerator::Yielder] Output stream.
# @param file   [String] Path to temporary file with records.
def process_output(output, file)
  File.open(file, 'rb') do |ios|
    BioDSL::Serializer.new(ios) do |reader|
      reader.each_with_index do |rec, index|
        next unless @wanted.include?(index)

        output << rec
        @status[:records_out] += 1
      end
    end
  end
end
|
|
152
|
+
end
|
|
153
|
+
end
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Read FASTA entries from one or more files.
|
|
30
|
+
#
|
|
31
|
+
# +read_fasta+ read in sequence entries from FASTA files. Each sequence
|
|
32
|
+
# entry consists of a sequence name prefixed by a '>' followed by the sequence
|
|
33
|
+
# name on a line of its own, followed by one or more lines of sequence until the
|
|
34
|
+
# next entry or the end of the file. The resulting Biopiece record consists of
|
|
35
|
+
# the following record type:
|
|
36
|
+
#
|
|
37
|
+
# {:SEQ_NAME=>"test",
|
|
38
|
+
# :SEQ=>"AGCATCGACTAGCAGCATTT",
|
|
39
|
+
# :SEQ_LEN=>20}
|
|
40
|
+
#
|
|
41
|
+
# Input files may be compressed with gzip or bzip2.
|
|
42
|
+
#
|
|
43
|
+
# For more about the FASTA format:
|
|
44
|
+
#
|
|
45
|
+
# http://en.wikipedia.org/wiki/Fasta_format
|
|
46
|
+
#
|
|
47
|
+
# == Usage
|
|
48
|
+
# read_fasta(input: <glob>[, first: <uint>|last: <uint>])
|
|
49
|
+
#
|
|
50
|
+
# === Options
|
|
51
|
+
# * input <glob> - Input file or file glob expression.
|
|
52
|
+
# * first <uint> - Only read in the _first_ number of entries.
|
|
53
|
+
# * last <uint> - Only read in the _last_ number of entries.
|
|
54
|
+
#
|
|
55
|
+
# == Examples
|
|
56
|
+
#
|
|
57
|
+
# To read all FASTA entries from a file:
|
|
58
|
+
#
|
|
59
|
+
# read_fasta(input: "test.fna")
|
|
60
|
+
#
|
|
61
|
+
# To read all FASTA entries from a gzipped file:
|
|
62
|
+
#
|
|
63
|
+
# read_fasta(input: "test.fna.gz")
|
|
64
|
+
#
|
|
65
|
+
# To read in only 10 records from a FASTA file:
|
|
66
|
+
#
|
|
67
|
+
# read_fasta(input: "test.fna", first: 10)
|
|
68
|
+
#
|
|
69
|
+
# To read in the last 10 records from a FASTA file:
|
|
70
|
+
#
|
|
71
|
+
# read_fasta(input: "test.fna", last: 10)
|
|
72
|
+
#
|
|
73
|
+
# To read all FASTA entries from multiple files:
|
|
74
|
+
#
|
|
75
|
+
# read_fasta(input: "test1.fna,test2.fna")
|
|
76
|
+
#
|
|
77
|
+
# To read FASTA entries from multiple files using a glob expression:
|
|
78
|
+
#
|
|
79
|
+
# read_fasta(input: "*.fna")
|
|
80
|
+
class ReadFasta
|
|
81
|
+
STATS = %i(records_in records_out sequences_in sequences_out residues_in
|
|
82
|
+
residues_out)
|
|
83
|
+
|
|
84
|
+
# Constructor for the ReadFasta class.
#
# @param [Hash] options Options hash.
# @option options [String, Array] :input String or Array with glob
#   expressions.
# @option options [Integer] :first Only read the first number of entries.
# @option options [Integer] :last  Only read the last number of entries.
#
# @return [ReadFasta] Returns an instance of the class.
def initialize(options)
  @options = options
  # @count tracks entries emitted for :first; @buffer holds the trailing
  # entries kept for :last.
  @count, @buffer = 0, []

  check_options
end
|
|
100
|
+
|
|
101
|
+
# Build the lambda that runs the read_fasta command.
#
# Records from the input stream are passed through first. Every file
# matching the :input option is then read according to the :first or
# :last option, or in full when neither is given. Entries buffered for
# :last are emitted once all files have been read.
#
# @return [Proc] Returns the read_fasta command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)
    read_input(input, output)

    options_glob(@options[:input]).each do |path|
      BioDSL::Fasta.open(path) do |ios|
        if @options[:first]
          read_first(ios, output)
        elsif @options[:last]
          read_last(ios)
        else
          read_all(ios, output)
        end
      end
    end

    write_buffer(output) if @options[:last]
  end
end
|
|
123
|
+
|
|
124
|
+
private
|
|
125
|
+
|
|
126
|
+
# Validate the options hash using the shared option helpers (defined
# outside this class): allowed keys, the required :input option, existence
# of the input files, :first and :last checked together by options_unique,
# and the non-negativity assertions on both.
def check_options
  opts = @options

  options_allowed(opts, :input, :first, :last)
  options_required(opts, :input)
  options_files_exist(opts, :input)
  options_unique(opts, :first, :last)
  options_assert(opts, ':first >= 0')
  options_assert(opts, ':last >= 0')
end
|
|
135
|
+
|
|
136
|
+
# Pass records from the input stream straight through to the output
# stream and update the status counters.
#
# Every record is counted both as read (records_in) and as emitted
# (records_out), since it is written to the output. Records carrying a
# :SEQ value are additionally counted in the sequence and residue
# counters, in and out.
#
# @param input  [Enumerable::Yielder, nil] Input stream, or nil if none.
# @param output [Enumerable::Yielder] Output stream.
def read_input(input, output)
  return unless input

  input.each do |record|
    output << record
    @status[:records_in]  += 1
    @status[:records_out] += 1

    seq = record[:SEQ]
    next unless seq

    @status[:sequences_in]  += 1
    @status[:sequences_out] += 1
    @status[:residues_in]   += seq.length
    @status[:residues_out]  += seq.length
  end
end
|
|
153
|
+
|
|
154
|
+
# Emit entries from a FASTA stream until the total number of emitted
# entries reaches the :first option. The running total lives in @count,
# so the limit applies across successive calls.
#
# @param input  [BioDSL::Fasta] FASTA file input stream.
# @param output [Enumerable::Yielder] Output stream.
#
# @return [Integer] Total number of entries emitted so far.
def read_first(input, output)
  limit = @options[:first]

  input.each do |entry|
    break if @count == limit

    output << entry.to_bp
    @status[:records_out]   += 1
    @status[:sequences_out] += 1
    @status[:residues_out]  += entry.length
    @count += 1
  end

  @count
end
|
|
177
|
+
|
|
178
|
+
# Read all entries from a FASTA stream, keeping only the most recent
# :last entries in @buffer. The buffer persists across calls.
#
# @param input [BioDSL::Fasta] FASTA file input stream.
#
# @return [Integer] Number of entries currently buffered.
def read_last(input)
  keep = @options[:last]

  input.each do |entry|
    @buffer.push(entry)
    @buffer.shift if @buffer.size > keep # drop the oldest entry
  end

  @buffer.size
end
|
|
194
|
+
|
|
195
|
+
# Emit every entry of a FASTA stream to the output, converted with
# #to_bp, and update the output status counters.
#
# @param input  [BioDSL::Fasta] FASTA file input stream.
# @param output [Enumerable::Yielder] Output stream.
def read_all(input, output)
  input.each do |fasta_entry|
    output << fasta_entry.to_bp
    @status[:records_out]   += 1
    @status[:sequences_out] += 1
    @status[:residues_out]  += fasta_entry.length
  end
end
|
|
208
|
+
|
|
209
|
+
# Emit the buffered entries (collected for the :last option) to the
# output, converted with #to_bp, and update the output status counters.
#
# @param output [Enumerable::Yielder] Output stream.
def write_buffer(output)
  @buffer.each do |buffered|
    output << buffered.to_bp
    @status[:records_out]   += 1
    @status[:sequences_out] += 1
    @status[:residues_out]  += buffered.length
  end
end
|
|
221
|
+
end
|
|
222
|
+
end
|