BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Align sequences in the stream using Mothur.
|
|
30
|
+
#
|
|
31
|
+
# This is a wrapper for the +mothur+ command +align.seqs()+. Basically,
|
|
32
|
+
# it aligns sequences to a reference alignment.
|
|
33
|
+
#
|
|
34
|
+
# Please refer to the manual:
|
|
35
|
+
#
|
|
36
|
+
# http://www.mothur.org/wiki/Align.seqs
|
|
37
|
+
#
|
|
38
|
+
# Mothur must be installed for +align_seq_mothur+ to work. Read more here:
|
|
39
|
+
#
|
|
40
|
+
# http://www.mothur.org/
|
|
41
|
+
#
|
|
42
|
+
# == Usage
|
|
43
|
+
#
|
|
44
|
+
# align_seq_mothur(<template_file: <file>>[, cpus: <uint>])
|
|
45
|
+
#
|
|
46
|
+
# === Options
|
|
47
|
+
#
|
|
48
|
+
# * template_file: <file> - File with template alignment in FASTA format.
|
|
49
|
+
# * cpus: <uint> - Number of CPU cores to use (default=1).
|
|
50
|
+
#
|
|
51
|
+
# == Examples
|
|
52
|
+
#
|
|
53
|
+
# To align the entries in the FASTA file `test.fna` to the template alignment
|
|
54
|
+
# in the file `template.fna` do:
|
|
55
|
+
#
|
|
56
|
+
# BP.new.
|
|
57
|
+
# read_fasta(input: "test.fna").
|
|
58
|
+
# align_seq_mothur(template_file: "template.fna").
|
|
59
|
+
# run
|
|
60
|
+
class AlignSeqMothur
|
|
61
|
+
require 'English'
|
|
62
|
+
require 'BioDSL/helpers/aux_helper'
|
|
63
|
+
|
|
64
|
+
include AuxHelper
|
|
65
|
+
|
|
66
|
+
# Names of the status counters this command maintains; the list is handed to
# status_init when the command lambda starts. Frozen so the shared constant
# cannot be modified by accident.
STATS = %i(records_in records_out sequences_in sequences_out residues_in
           residues_out).freeze
|
|
68
|
+
|
|
69
|
+
# Constructor for the AlignSeqMothur class.
|
|
70
|
+
#
|
|
71
|
+
# @param [Hash] options Options hash.
|
|
72
|
+
# @option options [String] :template_file Path to template file.
|
|
73
|
+
# @option options [Integer] :cpus Number of CPUs to use.
|
|
74
|
+
#
|
|
75
|
+
# @return [AlignSeqMothur] Returns an instance of the class.
|
|
76
|
+
def initialize(options)
  @options = options

  # Order matters: first make sure the external tool is available, then
  # validate the options as given, and only then fill in defaults.
  aux_exist 'mothur'
  check_options
  defaults
end
|
|
83
|
+
|
|
84
|
+
# Return a lambda for the align_seq_mothur command.
|
|
85
|
+
#
|
|
86
|
+
# @return [Proc] Returns the align_seq_mothur command lambda.
|
|
87
|
+
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    # NOTE(review): TmpDir.create presumably yields the paths of the two
    # named files followed by the directory itself - confirm against TmpDir.
    TmpDir.create('input.fna', 'input.align') do |fasta_in, align_out, work_dir|
      template = @options[:template_file]
      cpus     = @options[:cpus]

      # 1) dump sequence records to FASTA (other records pass straight on),
      # 2) let mothur align that file, 3) stream the aligned entries out.
      # mothur is never given the output path explicitly; it is expected to
      # leave 'input.align' in the directory set via set.dir - TODO confirm.
      process_input(input, output, fasta_in)
      run_mothur(template, cpus, work_dir, fasta_in)
      process_output(output, align_out)
    end
  end
end
|
|
98
|
+
|
|
99
|
+
private
|
|
100
|
+
|
|
101
|
+
# Check the options.
|
|
102
|
+
def check_options
  # Only :template_file and :cpus are accepted; :template_file is mandatory
  # and must point to an existing file.
  options_allowed(@options, :template_file, :cpus)
  options_required(@options, :template_file)
  options_files_exist(@options, :template_file)

  # :cpus must lie within 1..CORES_MAX. The assertions are passed as
  # expression strings, which is the form options_assert is called with here.
  options_assert(@options, ':cpus >= 1')
  cores_max = BioDSL::Config::CORES_MAX
  options_assert(@options, ":cpus <= #{cores_max}")
end
|
|
109
|
+
|
|
110
|
+
# Set default options.
|
|
111
|
+
def defaults
  # Keep a caller-supplied CPU count; otherwise fall back to a single core.
  return @options[:cpus] if @options[:cpus]

  @options[:cpus] = 1
end
|
|
114
|
+
|
|
115
|
+
# Process all records in the input stream and write those with sequences to
|
|
116
|
+
# file and all other records to the output stream.
|
|
117
|
+
#
|
|
118
|
+
# @param input [BioDSL::Stream] The input stream.
|
|
119
|
+
# @param output [BioDSL::Stream] The output stream.
|
|
120
|
+
# @param tmp_in [String] Path to temporary file.
|
|
121
|
+
def process_input(input, output, tmp_in)
  BioDSL::Fasta.open(tmp_in, 'w') do |fasta_io|
    input.each_with_index do |rec, index|
      @status[:records_in] += 1

      # Records carrying a sequence go to the FASTA file for mothur; the
      # index counts every record in the stream, not only sequences.
      if rec[:SEQ]
        write_entry(fasta_io, rec, index)
        next
      end

      # Everything else bypasses mothur and is emitted unchanged.
      output << rec
      @status[:records_out] += 1
    end
  end
end
|
|
135
|
+
|
|
136
|
+
# Write a record containing sequence information to a FASTA file IO handle.
|
|
137
|
+
# If no :SEQ_NAME is found in the record use the record's stream index
|
|
138
|
+
# instead.
|
|
139
|
+
#
|
|
140
|
+
# @param ios [Fasta::IO] FASTA IO.
|
|
141
|
+
# @param record [Hash] BioDSL record to create FASTA entry from.
|
|
142
|
+
# @param i [Integer] Sequence index.
|
|
143
|
+
def write_entry(ios, record, i)
  # Fall back to the stringified index when the record has no :SEQ_NAME.
  entry = BioDSL::Seq.new(seq_name: record[:SEQ_NAME] || i.to_s,
                          seq:      record[:SEQ])

  @status[:sequences_in] += 1
  @status[:residues_in]  += entry.length

  ios.puts(entry.to_fasta)
end
|
|
152
|
+
|
|
153
|
+
# Read all FASTA entries from output file and emit to the output stream.
|
|
154
|
+
#
|
|
155
|
+
# @param output [BioDSL::Stream] The output stream.
|
|
156
|
+
# @param tmp_out [String] Path to temporary file.
|
|
157
|
+
def process_output(output, tmp_out)
  BioDSL::Fasta.open(tmp_out) do |fasta_io|
    fasta_io.each do |aligned|
      # Convert each FASTA entry read from the file back into a record.
      output << aligned.to_bp

      %i(records_out sequences_out).each { |key| @status[key] += 1 }
      @status[:residues_out] += aligned.length
    end
  end
end
|
|
167
|
+
|
|
168
|
+
# Run Mothur using a system call.
|
|
169
|
+
#
|
|
170
|
+
# @param template_file [String] Path to template file.
|
|
171
|
+
# @param cpus [Integer] Number of CPUs to use.
|
|
172
|
+
# @param tmp_dir [String] Path to temporary dir.
|
|
173
|
+
# @param tmp_in [String] Path to temporary file.
|
|
174
|
+
#
|
|
175
|
+
# @raise [RuntimeError] If the system call fails.
|
|
176
|
+
def run_mothur(template_file, cpus, tmp_dir, tmp_in)
  # A single argument holding ';'-separated mothur commands, introduced by
  # '#'. This is the same string the shell used to hand to mothur.
  # NOTE(review): this is mothur's command line (batch) mode - see the mothur
  # manual for the exact syntax.
  batch = "#set.dir(input=#{tmp_dir});" \
          "set.dir(output=#{tmp_dir});" \
          "align.seqs(candidate=#{tmp_in}," \
          "template=#{template_file}," \
          "processors=#{cpus})"

  # Use the argv form of system so no shell parses the command: paths
  # containing quotes, '$' or backticks reach mothur verbatim instead of
  # breaking (or altering) the command line.
  succeeded =
    if BioDSL.verbose
      system('mothur', batch)
    else
      # Silence mothur's stdout and stderr when not running verbosely.
      system('mothur', batch, out: File::NULL, err: File::NULL)
    end

  # system returns true only for a zero exit status (false for a non-zero
  # exit, nil when the command could not be executed at all).
  fail 'Mothur failed' unless succeeded
end
|
|
193
|
+
end
|
|
194
|
+
end
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Analyze the residue distribution from sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +analyze_residue_distribution+ determines the distribution per position
|
|
32
|
+
# of residues from sequences and outputs records per observed residue with
|
|
33
|
+
# counts at the different positions. Using the +percent+ option outputs the
|
|
34
|
+
# count as percentages of observed residues per position.
|
|
35
|
+
#
|
|
36
|
+
# The records output looks like this:
|
|
37
|
+
#
|
|
38
|
+
# {:RECORD_TYPE=>"residue distribution",
|
|
39
|
+
# :V0=>"A",
|
|
40
|
+
# :V1=>5,
|
|
41
|
+
# :V2=>0,
|
|
42
|
+
# :V3=>0,
|
|
43
|
+
# :V4=>0}
|
|
44
|
+
#
|
|
45
|
+
# Which are ready for +write_table+. See examples.
|
|
46
|
+
#
|
|
47
|
+
# == Usage
|
|
48
|
+
#
|
|
49
|
+
# analyze_residue_distribution([percent: <bool>])
|
|
50
|
+
#
|
|
51
|
+
# === Options
|
|
52
|
+
#
|
|
53
|
+
# * percent: <bool> - Output distributions in percent (default=false).
|
|
54
|
+
#
|
|
55
|
+
# == Examples
|
|
56
|
+
#
|
|
57
|
+
# Consider the following entries in the file `test.fna`:
|
|
58
|
+
#
|
|
59
|
+
# >DNA
|
|
60
|
+
# AGCT
|
|
61
|
+
# >RNA
|
|
62
|
+
# AGCU
|
|
63
|
+
# >Protein
|
|
64
|
+
# FLS*
|
|
65
|
+
# >Gaps
|
|
66
|
+
# -.~
|
|
67
|
+
#
|
|
68
|
+
# Now we run the data through the following pipeline and get the resulting
|
|
69
|
+
# table:
|
|
70
|
+
#
|
|
71
|
+
# BP.new.
|
|
72
|
+
# read_fasta(input: "test.fna").
|
|
73
|
+
# analyze_residue_distribution.
|
|
74
|
+
# grab(select: "residue").
|
|
75
|
+
# write_table(skip: [:RECORD_TYPE]).
|
|
76
|
+
# run
|
|
77
|
+
#
|
|
78
|
+
# A 2 0 0 0
|
|
79
|
+
# G 0 2 0 0
|
|
80
|
+
# C 0 0 2 0
|
|
81
|
+
# T 0 0 0 1
|
|
82
|
+
# U 0 0 0 1
|
|
83
|
+
# F 1 0 0 0
|
|
84
|
+
# L 0 1 0 0
|
|
85
|
+
# S 0 0 1 0
|
|
86
|
+
# * 0 0 0 1
|
|
87
|
+
# - 1 0 0 0
|
|
88
|
+
# . 0 1 0 0
|
|
89
|
+
# ~ 0 0 1 0
|
|
90
|
+
#
|
|
91
|
+
# Here we do the same as above, but output percentages instead of absolute
|
|
92
|
+
# counts:
|
|
93
|
+
#
|
|
94
|
+
# BP.new.
|
|
95
|
+
# read_fasta(input: "test.fna").
|
|
96
|
+
# analyze_residue_distribution(percent: true).
|
|
97
|
+
# grab(select: "residue").
|
|
98
|
+
# write_table(skip: [:RECORD_TYPE]).
|
|
99
|
+
# run
|
|
100
|
+
#
|
|
101
|
+
# A 50 0 0 0
|
|
102
|
+
# G 0 50 0 0
|
|
103
|
+
# C 0 0 50 0
|
|
104
|
+
# T 0 0 0 33
|
|
105
|
+
# U 0 0 0 33
|
|
106
|
+
# F 25 0 0 0
|
|
107
|
+
# L 0 25 0 0
|
|
108
|
+
# S 0 0 25 0
|
|
109
|
+
# * 0 0 0 33
|
|
110
|
+
# - 25 0 0 0
|
|
111
|
+
# . 0 25 0 0
|
|
112
|
+
# ~ 0 0 25 0
|
|
113
|
+
class AnalyzeResidueDistribution
|
|
114
|
+
# Names of the status counters this command maintains; the list is handed to
# status_init when the command lambda starts. Frozen so the shared constant
# cannot be modified by accident.
STATS = %i(records_in records_out sequences_in sequences_out residues_in
           residues_out).freeze
|
|
116
|
+
|
|
117
|
+
# Constructor for the AnalyzeResidueDistribution class.
|
|
118
|
+
#
|
|
119
|
+
# @param [Hash] options Options hash.
|
|
120
|
+
# @option options [Boolean] :percent Output distribution in percent.
|
|
121
|
+
#
|
|
122
|
+
# @return [AnalyzeResidueDistribution] Returns an instance of the class.
|
|
123
|
+
def initialize(options)
  # Set is instantiated below, i.e. before #lmb gets a chance to load the
  # library, so it has to be required here to avoid a NameError on Rubies
  # where Set is not loaded automatically.
  require 'set'

  @options = options

  check_options

  # position => { residue symbol => count }, created lazily per position.
  @counts   = Hash.new { |h, k| h[k] = Hash.new(0) }
  # position => number of residues observed at that position.
  @total    = Hash.new(0)
  # Every distinct residue symbol seen, in order of first appearance.
  @residues = Set.new
end
|
|
132
|
+
|
|
133
|
+
# Return a lambda for the analyze_residue_distribution command.
|
|
134
|
+
#
|
|
135
|
+
# @return [Proc] Returns the analyze_residue_distribution command lambda.
|
|
136
|
+
def lmb
  require 'set'

  lambda do |input, output, status|
    status_init(status, STATS)

    input.each do |record|
      @status[:records_in] += 1

      # Only records carrying a sequence contribute to the distribution.
      analyze_residues(record[:SEQ]) if record[:SEQ]

      # Every input record is passed through unchanged when there is an
      # output stream to pass it to.
      next unless output

      output << record
      @status[:records_out] += 1
    end

    # The pass-through above tolerates a missing output stream; apply the
    # same guard here so emitting the distribution records cannot fail on a
    # nil output.
    calc_dist(output) if output
  end
end
|
|
156
|
+
|
|
157
|
+
private
|
|
158
|
+
|
|
159
|
+
# Check the options.
|
|
160
|
+
def check_options
  # :percent is the only option, and it must be a boolean or left unset.
  options_allowed(@options, :percent)

  booleans_or_unset = [nil, true, false]
  options_allowed_values(@options, percent: booleans_or_unset)
end
|
|
164
|
+
|
|
165
|
+
# Tally the residues of a given sequence per position (case-insensitive).
|
|
166
|
+
#
|
|
167
|
+
# @param seq [String] - Sequence to analyze.
|
|
168
|
+
def analyze_residues(seq)
  len = seq.length

  # The sequence itself is passed through untouched by the caller, so the
  # in and out counters move in lockstep.
  @status[:sequences_in]  += 1
  @status[:sequences_out] += 1
  @status[:residues_in]   += len
  @status[:residues_out]  += len

  # Residues are upcased before counting, so 'a' and 'A' share a tally.
  seq.upcase.chars.each.with_index do |char, pos|
    residue = char.to_sym

    @counts[pos][residue] += 1
    @total[pos] += 1
    @residues << residue
  end
end
|
|
181
|
+
|
|
182
|
+
# Calculate the residue distribution.
|
|
183
|
+
#
|
|
184
|
+
# @param output [BioDSL::Stream] Output stream.
|
|
185
|
+
def calc_dist(output)
  # Nothing to emit to when there is no output stream.
  return unless output

  # One record per distinct residue: V0 holds the residue, V1..Vn are filled
  # in by the helpers with the per-position count or percentage.
  @residues.each do |res|
    record = { RECORD_TYPE: 'residue distribution', V0: res.to_s }

    if @options[:percent]
      calc_dist_percent(record, res)
    else
      calc_dist_count(record, res)
    end

    output << record
    # Count emitted distribution records like any other record written to
    # the output stream.
    @status[:records_out] += 1
  end
end
|
|
200
|
+
|
|
201
|
+
# Calculate the residue distribution in percent for a given residue.
|
|
202
|
+
#
|
|
203
|
+
# @param record [Hash] BioDSL record.
|
|
204
|
+
# @param res [Symbol] Residue.
|
|
205
|
+
def calc_dist_percent(record, res)
  @counts.each_pair do |pos, dist|
    total = @total[pos]

    # Integer arithmetic: percentages are truncated, and a position without
    # any observations yields 0 instead of dividing by zero.
    share = total.zero? ? 0 : 100 * dist[res] / total

    # Positions are 0-based internally; columns start at V1 (V0 holds the
    # residue itself).
    record[:"V#{pos + 1}"] = share
  end
end
|
|
211
|
+
|
|
212
|
+
# Calculate the residue distribution for a given residue.
|
|
213
|
+
#
|
|
214
|
+
# @param record [Hash] BioDSL record.
|
|
215
|
+
# @param res [Symbol] Residue.
|
|
216
|
+
def calc_dist_count(record, res)
  # Column V(pos + 1) receives the absolute count of the residue at the
  # 0-based position pos; residues never seen there count as 0.
  @counts.each_pair { |pos, dist| record[:"V#{pos + 1}"] = dist[res] }
end
|
|
221
|
+
end
|
|
222
|
+
end
|