BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
  # == Collect OTU data from records in the stream.
  #
  # +collect_otus+ counts the number of times each OTU is found in a set of
  # samples. OTUs are given by the :S_ID key and samples by the :SAMPLE key.
  # If a :SEQ_COUNT key is present it will be used to increment the OTU count,
  # allowing for dereplicated sequences to be used.
  #
  # Only records whose :TYPE value is 'H' are counted. All input records are
  # passed through unchanged and one record per OTU is appended to the stream
  # holding a <SAMPLE>_COUNT key for every sample seen (sample names are
  # upcased).
  #
  # == Usage
  #
  #    collect_otus()
  #
  # === Options
  #
  # +collect_otus+ takes no options.
  #
  # == Examples
  #
  # Given the following hit records in the stream:
  #
  #    {:TYPE=>"H", :S_ID=>"OTU_1", :SAMPLE=>"a", :SEQ_COUNT=>3}
  #    {:TYPE=>"H", :S_ID=>"OTU_1", :SAMPLE=>"b"}
  #    {:TYPE=>"H", :S_ID=>"OTU_2", :SAMPLE=>"b"}
  #
  # +collect_otus+ appends these records to the stream:
  #
  #    {:RECORD_TYPE=>"OTU", :OTU=>"OTU_1", :A_COUNT=>3, :B_COUNT=>1}
  #    {:RECORD_TYPE=>"OTU", :OTU=>"OTU_2", :A_COUNT=>0, :B_COUNT=>1}
  class CollectOtus
    # No longer needed by this class (SortedSet is gone from the stdlib since
    # Ruby 3.0), but kept so that nothing relying on Set being loaded breaks.
    require 'set'

    STATS = %i(records_in records_out hits_in hits_out)

    # Constructor for CollectOtus.
    #
    # @param options [Hash] Options hash.
    def initialize(options)
      @options = options

      check_options
    end

    # Return lambda for CollectOtus command.
    #
    # The lambda passes every input record through to the output, then emits
    # one OTU record per distinct :S_ID seen among the hit records.
    #
    # @return [Proc] Command lambda.
    def lmb
      lambda do |input, output, status|
        status_init(status, STATS)

        count_hash = process_input(input, output)
        samples    = collect_samples(count_hash)
        process_output(count_hash, samples, output)
      end
    end

    private

    # Check options.
    def check_options
      options_allowed(@options, nil)
    end

    # Read input stream and for all hit records add these to the count hash.
    # Every record, hit or not, is passed on to the output stream.
    #
    # @param input  [Enumerator]          Input stream.
    # @param output [Enumerator::Yielder] Output stream.
    #
    # @return [Hash] Returns the count_hash (OTU id => {sample => count}).
    def process_input(input, output)
      count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }

      input.each do |record|
        @status[:records_in] += 1

        add_to_count_hash(count_hash, record) if record[:TYPE] == 'H'

        output << record
        @status[:records_out] += 1
      end

      count_hash
    end

    # Add to the count_hash a given record.
    #
    # The count is incremented by :SEQ_COUNT when present, otherwise by 1.
    #
    # @param count_hash [Hash] Hash with sample counts
    # @param record     [Hash] BioDSL record with sample and count.
    def add_to_count_hash(count_hash, record)
      id     = record[:S_ID].to_sym
      sample = record[:SAMPLE].upcase.to_sym
      count_hash[id][sample] += (record[:SEQ_COUNT] || 1)
      @status[:hits_in] += 1
    end

    # Collect all sample names in the count_hash, sorted and without
    # duplicates.
    #
    # A plain sorted Array is used instead of SortedSet, which was removed
    # from the +set+ standard library in Ruby 3.0.
    #
    # @param count_hash [Hash] Hash with sample counts.
    #
    # @return [Array<Symbol>] Sorted, unique sample names.
    def collect_samples(count_hash)
      count_hash.each_value.flat_map(&:keys).uniq.sort
    end

    # Output all samples and counts from the count_hash and samples to the
    # output stream.
    #
    # @param count_hash [Hash]                Hash with sample counts
    # @param samples    [Enumerable<Symbol>]  Sorted sample names.
    # @param output     [Enumerator::Yielder] Output stream.
    def process_output(count_hash, samples, output)
      count_hash.each do |otu, counts|
        record = { RECORD_TYPE: 'OTU', OTU: otu.to_s }

        # counts defaults to 0, so samples lacking this OTU are reported as 0.
        samples.each do |sample|
          record["#{sample}_COUNT".to_sym] = counts[sample]
        end

        output << record

        @status[:hits_out]    += 1
        @status[:records_out] += 1
      end
    end
  end
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
  # == Complement sequences in the stream.
  #
  # +complement_seq+ complements sequences in the stream. The sequence type -
  # DNA or RNA - is guessed by inspecting the first sequence in the stream.
  #
  # +complement_seq+ can be used together with +reverse_seq+ to reverse-
  # complement sequences.
  #
  # == Usage
  #
  #    complement_seq()
  #
  # === Options
  #
  # +complement_seq+ takes no options.
  #
  # == Examples
  #
  # Consider the following FASTQ entry in the file test.fq:
  #
  #    @M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185
  #    TTGTAAAACGACGGCCAGTG
  #    +
  #    >>>>>FFFFD@A?A0AE0FG
  #
  # To complement the sequence do:
  #
  #    BP.new.read_fastq(input:"test.fq").complement_seq.dump.run
  #
  #    {:SEQ_NAME=>"M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185",
  #     :SEQ=>"AACATTTTGCTGCCGGTCAC",
  #     :SEQ_LEN=>20,
  #     :SCORES=>">>>>>FFFFD@A?A0AE0FG"}
  class ComplementSeq
    STATS = %i(records_in records_out sequences_in sequences_out residues_in
               residues_out)

    # Constructor for ComplementSeq.
    #
    # @param options [Hash] Options hash.
    def initialize(options)
      @options = options
      @type    = nil

      check_options
    end

    # Return the command lambda for ComplementSeq.
    #
    # Records with a :SEQ key are complemented in place; every record is
    # passed on to the output stream.
    #
    # @return [Proc] Command lambda
    def lmb
      lambda do |input, output, status|
        status_init(status, STATS)

        # The type is guessed from the first sequence of *this* stream, so a
        # guess left over from an earlier run of the lambda must not be reused.
        @type = nil

        input.each do |record|
          @status[:records_in] += 1

          complement(record) if record.key? :SEQ

          output << record

          @status[:records_out] += 1
        end
      end
    end

    private

    # Check options.
    def check_options
      options_allowed(@options, nil)
    end

    # Complements sequence in record and merges the result back into it.
    #
    # The sequence type is guessed once, from the first sequence seen, and
    # applied to all following sequences.
    #
    # @param record [Hash] BioDSL record with sequence.
    def complement(record)
      entry = BioDSL::Seq.new_bp(record)
      @type ||= entry.type_guess
      entry.type = @type
      entry.complement!

      @status[:sequences_in]  += 1
      @status[:sequences_out] += 1
      @status[:residues_in]   += entry.length
      @status[:residues_out]  += entry.length

      record.merge! entry.to_bp
    end
  end
end
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
# Namespace for BioDSL.
|
|
29
|
+
module BioDSL
  # == Count the number of records in the stream.
  #
  # +count+ counts the number of records in the stream and outputs the
  # count as a record whose count is _not_ included. Using the +output+
  # option will output the count in a file as a table with header.
  #
  # == Usage
  #
  #    count([output: <file>[, force: <bool>]])
  #
  # === Options
  #
  # * output: <file> - Output file.
  # * force: <bool>  - Force overwrite existing output file.
  #
  # == Examples
  #
  # To count the number of records in the file `test.fq`:
  #
  #    BP.new.read_fastq(input: "test.fq").count(output: "count.txt").dump.run
  #
  #    {:SEQ_NAME=>"ILLUMINA-52179E_0004:2:1:1040:5263#TTAGGC/1",
  #     :SEQ=>"TTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGTCGANNNCAT",
  #     :SEQ_LEN=>50,
  #     :SCORES=>"GGFBGGEADFAFFDDD,-5AC5?>C:)7?#####################"}
  #    {:SEQ_NAME=>"ILLUMINA-52179E_0004:2:1:1041:14486#TTAGGC/1",
  #     :SEQ=>"CATGGCGTATGCCAGACGGCCAGAACGATGGCCGCCGGGCTTCANNNAAG",
  #     :SEQ_LEN=>50,
  #     :SCORES=>"FFFFDBD?EEEEEEEFGGFAGAGEFDF=BFGFFGGDDDD=ABAA######"}
  #    {:SEQ_NAME=>"ILLUMINA-52179E_0004:2:1:1043:19446#TTAGGC/1",
  #     :SEQ=>"CGGTACTGATCGAGTGTCAGGCTGTTGATCGCCGCGGGCGGGGGTNNGAC",
  #     :SEQ_LEN=>50,
  #     :SCORES=>"ECAEBEEEEEFFFFFEFFFFDDEEEGGGGGDEBEECBDAE@#########"}
  #    {:RECORD_TYPE=>"count", :COUNT=>3}
  #
  # And the count is also saved in the file `count.txt`:
  #
  #    #RECORD_TYPE  COUNT
  #    count 3
  class Count
    STATS = %i(records_in records_out)

    # Constructor for the count command.
    #
    # @param options [Hash] Options hash.
    # @option options [String]  :output Path to output file.
    # @option options [Boolean] :force  Force overwrite of output file.
    #
    # @return [Count] Instance of class Count.
    def initialize(options)
      @options = options

      check_options
    end

    # Return the command lambda for count.
    #
    # The lambda forwards every record, appends a single count record and,
    # when the :output option is set, writes the count to that file.
    #
    # @return [Proc] Command lambda.
    def lmb
      lambda do |input, output, status|
        status_init(status, STATS)

        process_input(input, output)

        total = @status[:records_in]
        emit(output, { RECORD_TYPE: 'count', COUNT: total })

        write_output if @options[:output]
      end
    end

    private

    # Check options.
    def check_options
      options_allowed(@options, :output, :force)
      options_allowed_values(@options, force: [true, false, nil])
      options_files_exist_force(@options, :output)
    end

    # Process the input stream and emit all records to the output stream.
    #
    # @param input  [Enumerator]          Input stream
    # @param output [Enumerator::Yielder] Output stream
    def process_input(input, output)
      input.each do |record|
        @status[:records_in] += 1

        emit(output, record)
      end
    end

    # Push a record onto the output stream and account for it in the status.
    #
    # @param output [Enumerator::Yielder] Output stream
    # @param record [Hash]                Record to emit.
    def emit(output, record)
      output << record
      @status[:records_out] += 1
    end

    # Write output table (header line plus one count line) to file.
    def write_output
      header = "#RECORD_TYPE\tCOUNT"
      row    = "count\t#{@status[:records_in]}"

      Filesys.open(@options[:output], 'w') do |ios|
        ios.puts header
        ios.puts row
      end
    end
  end
end
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
  # == Count the number of times values of given keys exist in the stream.
  #
  # +count_values+ counts the values for a given comma separated list of keys.
  # For every listed key present in a record, a <key>_COUNT key is added
  # holding the number of records in the stream sharing that value.
  #
  # == Usage
  #
  #    count_values(keys: <list>)
  #
  # === Options
  #
  # * keys: <list> - Keys whose values to count.
  #
  # == Examples
  #
  # Consider the following two column table in the file `test.tab`:
  #
  #    Human   H1
  #    Human   H2
  #    Human   H3
  #    Dog     D1
  #    Dog     D2
  #    Mouse   M1
  #
  # To count the values of both columns we first read the table with
  # +read_table+ and then pass the result to +count_values+:
  #
  #    BP.new.
  #    read_table(input: "test.tab").
  #    count_values(keys: [:V0, :V1]).
  #    dump.
  #    run
  #
  #    {:V0=>"Human", :V1=>"H1", :V0_COUNT=>3, :V1_COUNT=>1}
  #    {:V0=>"Human", :V1=>"H2", :V0_COUNT=>3, :V1_COUNT=>1}
  #    {:V0=>"Human", :V1=>"H3", :V0_COUNT=>3, :V1_COUNT=>1}
  #    {:V0=>"Dog", :V1=>"D1", :V0_COUNT=>2, :V1_COUNT=>1}
  #    {:V0=>"Dog", :V1=>"D2", :V0_COUNT=>2, :V1_COUNT=>1}
  #    {:V0=>"Mouse", :V1=>"M1", :V0_COUNT=>1, :V1_COUNT=>1}
  class CountValues
    STATS = %i(records_in records_out)

    # Constructor for CountValues.
    #
    # @param options [Hash] Options hash.
    # @option options [Array] :keys List of keys whose values to count.
    #
    # @return [CountValues] Instance of class.
    def initialize(options)
      @options = options

      check_options

      @keys = @options[:keys].map(&:to_sym)
      reset_counts
    end

    # Return the command lambda for the count_values command.
    #
    # The stream is read in full and spooled to a temporary file while
    # counting, then replayed from that file with the counts added.
    #
    # @return [Proc] Return command lambda.
    def lmb
      lambda do |input, output, status|
        status_init(status, STATS)

        # Start from empty counts so that running the lambda more than once
        # does not add the counts of an earlier run to this one.
        reset_counts

        TmpDir.create('count_values') do |tmp_file, _|
          process_input(input, tmp_file)
          process_output(output, tmp_file)
        end
      end
    end

    private

    # Check options.
    def check_options
      options_allowed(@options, :keys)
      options_required(@options, :keys)
    end

    # (Re)initialize the count hash: key => {value => count}.
    def reset_counts
      @count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }
    end

    # Save serialized stream to a temporary file while counting the requested
    # values.
    #
    # @param input    [Enumerator] Input stream.
    # @param tmp_file [String]     Path to temp file.
    def process_input(input, tmp_file)
      File.open(tmp_file, 'wb') do |ios|
        BioDSL::Serializer.new(ios) do |s|
          input.each do |record|
            @keys.each do |key|
              @count_hash[key][record[key]] += 1 if record.key? key
            end

            @status[:records_in] += 1

            s << record
          end
        end
      end
    end

    # Output serialized stream to the output stream including value counts.
    #
    # @param output   [Enumerator::Yielder] Output stream.
    # @param tmp_file [String] Path to temp file with serialized input stream.
    def process_output(output, tmp_file)
      File.open(tmp_file, 'rb') do |ios|
        BioDSL::Serializer.new(ios) do |s|
          s.each do |record|
            @keys.each do |key|
              next unless record.key? key

              record["#{key}_COUNT".to_sym] = @count_hash[key][record[key]]
            end

            output << record
            @status[:records_out] += 1
          end
        end
      end
    end
  end
end
|