BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Merge pair-end sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +merge_pair_seq+ merges paired sequences in the stream, if these are
|
|
32
|
+
# interleaved. Sequence names must be in either Illumina1.3/1.5 format
|
|
33
|
+
# trailing a /1 or /2 or Illumina1.8 containing 1: or 2:. Sequence names must
|
|
34
|
+
# match accordingly in order to merge sequences.
|
|
35
|
+
#
|
|
36
|
+
# == Usage
|
|
37
|
+
#
|
|
38
|
+
# merge_pair_seq
|
|
39
|
+
#
|
|
40
|
+
# === Options
|
|
41
|
+
#
|
|
42
|
+
# == Examples
|
|
43
|
+
#
|
|
44
|
+
# Consider the following FASTQ entry in the file test.fq:
|
|
45
|
+
#
|
|
46
|
+
# @M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14
|
|
47
|
+
# TGGGGAATATTGGACAATGG
|
|
48
|
+
# +
|
|
49
|
+
# <??????BDDDDDDDDGGGG
|
|
50
|
+
# @M01168:16:000000000-A1R9L:1:1101:14862:1868 2:N:0:14
|
|
51
|
+
# CCTGTTTGCTACCCACGCTT
|
|
52
|
+
# +
|
|
53
|
+
# ?????BB<-<BDDDDDFEEF
|
|
54
|
+
# @M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14
|
|
55
|
+
# TAGGGAATCTTGCACAATGG
|
|
56
|
+
# +
|
|
57
|
+
# <???9?BBBDBDDBDDFFFF
|
|
58
|
+
# @M01168:16:000000000-A1R9L:1:1101:13906:2139 2:N:0:14
|
|
59
|
+
# ACTCTTCGCTACCCATGCTT
|
|
60
|
+
# +
|
|
61
|
+
# ,5<??BB?DDABDBDDFFFF
|
|
62
|
+
# @M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14
|
|
63
|
+
# TAGGGAATCTTGCACAATGG
|
|
64
|
+
# +
|
|
65
|
+
# ?????BBBBBDDBDDBFFFF
|
|
66
|
+
# @M01168:16:000000000-A1R9L:1:1101:14865:2158 2:N:0:14
|
|
67
|
+
# CCTCTTCGCTACCCATGCTT
|
|
68
|
+
# +
|
|
69
|
+
# ??,<??B?BB?BBBBBFF?F
|
|
70
|
+
#
|
|
71
|
+
# To merge these interleaved pair-end sequences use merge_pair_seq:
|
|
72
|
+
#
|
|
73
|
+
# BP.new.
|
|
74
|
+
# read_fastq(input: "test.fq", encoding: :base_33).
|
|
75
|
+
# merge_pair_seq.
|
|
76
|
+
# dump.
|
|
77
|
+
# run
|
|
78
|
+
#
|
|
79
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14",
|
|
80
|
+
# :SEQ=>"TGGGGAATATTGGACAATGGCCTGTTTGCTACCCACGCTT",
|
|
81
|
+
# :SEQ_LEN=>40,
|
|
82
|
+
# :SCORES=>"<??????BDDDDDDDDGGGG?????BB<-<BDDDDDFEEF",
|
|
83
|
+
# :SEQ_LEN_LEFT=>20,
|
|
84
|
+
# :SEQ_LEN_RIGHT=>20}
|
|
85
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14",
|
|
86
|
+
# :SEQ=>"TAGGGAATCTTGCACAATGGACTCTTCGCTACCCATGCTT",
|
|
87
|
+
# :SEQ_LEN=>40,
|
|
88
|
+
# :SCORES=>"<???9?BBBDBDDBDDFFFF,5<??BB?DDABDBDDFFFF",
|
|
89
|
+
# :SEQ_LEN_LEFT=>20,
|
|
90
|
+
# :SEQ_LEN_RIGHT=>20}
|
|
91
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14",
|
|
92
|
+
# :SEQ=>"TAGGGAATCTTGCACAATGGCCTCTTCGCTACCCATGCTT",
|
|
93
|
+
# :SEQ_LEN=>40,
|
|
94
|
+
# :SCORES=>"?????BBBBBDDBDDBFFFF??,<??B?BB?BBBBBFF?F",
|
|
95
|
+
# :SEQ_LEN_LEFT=>20,
|
|
96
|
+
# :SEQ_LEN_RIGHT=>20}
|
|
97
|
+
class MergePairSeq
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for MergePairSeq.
  #
  # @param options [Hash] Options hash (validated by check_options).
  #
  # @return [MergePairSeq] Instance of MergePairSeq.
  def initialize(options)
    @options = options

    check_options
  end

  # Return the command lambda for merge_pair_seq.
  #
  # The lambda consumes the input stream two records at a time. When both
  # records of a slice carry a :SEQ value they are merged into one record.
  # Any other slice - including a trailing single record when the stream
  # holds an odd number of records - is passed through unchanged.
  #
  # @return [Proc] Command lambda for merge_pair_seq.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each_slice(2) do |record1, record2|
        @status[:records_in] += record2 ? 2 : 1

        # record2 is nil for the last slice of an odd-sized stream, so it
        # must be checked before it is indexed.
        if record2 && record1[:SEQ] && record2[:SEQ]
          output << merge_pair_seq(record1, record2)

          @status[:sequences_in] += 2
          @status[:sequences_out] += 1
          @status[:records_out] += 1
        else
          pass_through(output, record1, record2)
        end
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, nil)
  end

  # Emit records that could not be merged and count them.
  #
  # @param output  [#puts] Output stream.
  # @param records [Array<Hash, nil>] Records of the current slice; nil
  #   entries (missing second record) are dropped.
  def pass_through(output, *records)
    present = records.compact

    output.puts(*present)

    @status[:records_out] += present.size
  end

  # Merge entry pair and return a new BioDSL record with this.
  #
  # @param record1 [Hash] BioDSL record 1.
  # @param record2 [Hash] BioDSL record 2.
  #
  # @return [Hash] BioDSL record.
  def merge_pair_seq(record1, record2)
    entry1 = BioDSL::Seq.new_bp(record1)
    entry2 = BioDSL::Seq.new_bp(record2)

    BioDSL::Seq.check_name_pair(entry1, entry2)

    # Lengths are captured before entry2 is appended to entry1.
    length1 = entry1.length
    length2 = entry2.length

    @status[:residues_in] += length1 + length2

    entry1 << entry2

    @status[:residues_out] += entry1.length

    new_record(entry1, length1, length2)
  end

  # Convert the merged entry to a record annotated with both pair lengths.
  #
  # @param entry1  [BioDSL::Seq] Merged sequence entry.
  # @param length1 [Integer] Length of the left sequence before merging.
  # @param length2 [Integer] Length of the right sequence before merging.
  #
  # @return [Hash] BioDSL record with :SEQ_LEN_LEFT and :SEQ_LEN_RIGHT set.
  def new_record(entry1, length1, length2)
    record = entry1.to_bp
    record[:SEQ_LEN_LEFT]  = length1
    record[:SEQ_LEN_RIGHT] = length2
    record
  end
end
|
|
175
|
+
end
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Merge records on a given key with tabular data from one or more files.
|
|
30
|
+
#
|
|
31
|
+
# +merge_table+ reads in one or more tabular files and merges any records in
|
|
32
|
+
# the stream with identical values for a given key. The values for the given
|
|
33
|
+
# key must be unique in the tabular files, but not necessarily in the stream.
|
|
34
|
+
#
|
|
35
|
+
# Consult *read_table* for details on how the tabular files are read.
|
|
36
|
+
#
|
|
37
|
+
# The stats for +merge_table+ include the following values:
|
|
38
|
+
#
|
|
39
|
+
# * rows_total - total number of table rows.
|
|
40
|
+
# * rows_matched - number of table rows with the given key.
|
|
41
|
+
# * rows_unmatched - number of table rows without the given key.
|
|
42
|
+
# * merged - number of records that was merged.
|
|
43
|
+
# * non_merged - number of records that was not merged.
|
|
44
|
+
#
|
|
45
|
+
# == Usage
|
|
46
|
+
# merge_table(<input: <glob>>, <key: <string>>[, columns: <list>
|
|
47
|
+
# [, keys: <list>[, skip: <uint>[, delimiter: <string>]]]])
|
|
48
|
+
#
|
|
49
|
+
# === Options
|
|
50
|
+
# * input <glob> - Input file or file glob expression.
|
|
51
|
+
# * key <string> - Key used to merge
|
|
52
|
+
# * columns <list> - List of columns to read in that order.
|
|
53
|
+
# * keys <list> - List of key identifiers to use for each column.
|
|
54
|
+
# * skip <uint> - Number of initial lines to skip (default=0).
|
|
55
|
+
# * delimiter <string> - Delimiter to use for separating columns
|
|
56
|
+
# (default="\s+").
|
|
57
|
+
#
|
|
58
|
+
# == Examples
|
|
59
|
+
#
|
|
60
|
+
# Consider the following two files:
|
|
61
|
+
#
|
|
62
|
+
# test1.tab:
|
|
63
|
+
# #ID ORGANISM
|
|
64
|
+
# 1 parrot
|
|
65
|
+
# 2 eel
|
|
66
|
+
# 3 platypus
|
|
67
|
+
# 4 beetle
|
|
68
|
+
#
|
|
69
|
+
# test2.tab:
|
|
70
|
+
#
|
|
71
|
+
# #ID COUNT
|
|
72
|
+
# 1 5423
|
|
73
|
+
# 2 34
|
|
74
|
+
# 3 2423
|
|
75
|
+
# 4 234
|
|
76
|
+
#
|
|
77
|
+
# We can merge the data with +merge_table+ like this:
|
|
78
|
+
#
|
|
79
|
+
# BP.new.
|
|
80
|
+
# read_table(input: "test1.tab").
|
|
81
|
+
# merge_table(input: "test2.tab", key: :ID).
|
|
82
|
+
# dump.
|
|
83
|
+
# run
|
|
84
|
+
#
|
|
85
|
+
# {:ID=>1, :ORGANISM=>"parrot", :COUNT=>5423}
|
|
86
|
+
# {:ID=>2, :ORGANISM=>"eel", :COUNT=>34}
|
|
87
|
+
# {:ID=>3, :ORGANISM=>"platypus", :COUNT=>2423}
|
|
88
|
+
# {:ID=>4, :ORGANISM=>"beetle", :COUNT=>234}
|
|
89
|
+
class MergeTable
  STATS = %i(records_in records_out rows_total rows_matched rows_unmatched
             merged non_merged)

  # Constructor for MergeTable.
  #
  # @param options [Hash]
  #   Options hash.
  #
  # @option options [String] :input
  #   Input glob expression.
  #
  # @option options [String, Symbol] :key
  #   Key used to merge.
  #
  # @option options [Array] :keys
  #   List of key identifiers to use for each column.
  #
  # @option options [Array] :columns
  #   List of columns to read in that order.
  #
  # @option options [Integer] :skip
  #   Number of initial lines to skip.
  #
  # @option options [String] :delimiter
  #   Delimiter to use for separating columns.
  #
  # @return [MergeTable] Class instance.
  def initialize(options)
    @options = options

    check_options
    defaults

    @table = {}
    @key   = @options[:key].to_sym
    @keys  = @options[:keys] ? @options[:keys].map(&:to_sym) : nil
  end

  # Return command lambda for merge_table.
  #
  # The lambda first loads all table rows, then merges each stream record
  # with the table row sharing its value for the merge key. Records without
  # a matching row are emitted unchanged.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      parse_input_tables

      input.each do |record|
        @status[:records_in] += 1

        row = record[@key] && @table[record[@key]]

        if row
          @status[:merged] += 1
          record = record.merge(row)
        else
          @status[:non_merged] += 1
        end

        output << record
        @status[:records_out] += 1
      end

      @status[:rows_total] = @status[:rows_matched] + @status[:rows_unmatched]
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :input, :key, :keys, :columns, :skip,
                    :delimiter)
    options_required(@options, :input, :key)
    options_files_exist(@options, :input)
    options_list_unique(@options, :keys, :columns)
    options_assert(@options, ':skip >= 0')
  end

  # Set default options.
  def defaults
    @options[:skip] ||= 0
  end

  # Parse input table files and add each row to a table hash.
  def parse_input_tables
    options_glob(@options[:input]).each do |file|
      BioDSL::CSV.open(file) do |ios|
        ios.skip(@options[:skip])

        ios.each_hash(delimiter: @options[:delimiter],
                      select: @options[:columns]) do |record|
          trim_record(record) if @keys

          add_row(record)
        end
      end
    end
  end

  # Rename the leading entries of a record, in place, to the names in @keys.
  #
  # The first @keys.size key/value pairs are removed and re-inserted under
  # the corresponding names from @keys. All old names are removed before any
  # new name is written, so a new name that equals the original name of
  # another renamed column does not clobber that column's value.
  #
  # @param record [Hash] BioDSL record.
  def trim_record(record)
    pairs = record.first(@keys.size)

    pairs.each { |old_key, _| record.delete(old_key) }

    pairs.each_with_index do |(_, value), i|
      record[@keys[i]] = value
    end
  end

  # Add a given record to the table hash.
  #
  # Rows lacking a value for the merge key are only counted as unmatched.
  #
  # @param record [Hash] BioDSL record.
  #
  # @raise [RuntimeError] if duplicate values are found.
  def add_row(record)
    if record[@key]
      check_duplicate(record)

      @status[:rows_matched] += 1

      @table[record[@key]] = record
    else
      @status[:rows_unmatched] += 1
    end
  end

  # Check if a given record is already added to the table and raise if so.
  #
  # @param record [Hash] BioDSL record.
  #
  # @raise [RuntimeError] if duplicate values are found.
  def check_duplicate(record)
    return unless @table[record[@key]]
    fail "Duplicate values found for key: #{@key} value: #{record[@key]}"
  end
end
|
|
225
|
+
end
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Merge values of specified keys.
|
|
30
|
+
#
|
|
31
|
+
# +merge_values+ merges the values of a list of keys using a given delimiter
|
|
32
|
+
# and saves the new value as the value of the first key.
|
|
33
|
+
#
|
|
34
|
+
# == Usage
|
|
35
|
+
#
|
|
36
|
+
# merge_values(<keys: <list>>[, delimiter: <string>])
|
|
37
|
+
#
|
|
38
|
+
# === Options
|
|
39
|
+
#
|
|
40
|
+
# * keys: <list> - List of keys to merge.
|
|
41
|
+
# * delimiter: <string> - Delimiter (default='_').
|
|
42
|
+
#
|
|
43
|
+
# == Examples
|
|
44
|
+
#
|
|
45
|
+
# Consider the following record:
|
|
46
|
+
#
|
|
47
|
+
# {ID: "FOO", COUNT: 10, SEQ: "gataag"}
|
|
48
|
+
#
|
|
49
|
+
# To merge the values so that the COUNT and ID are merged in that order do:
|
|
50
|
+
#
|
|
51
|
+
# merge_values(keys: [:COUNT, :ID])
|
|
52
|
+
#
|
|
53
|
+
# {:ID=>"FOO", :COUNT=>"10_FOO", :SEQ=>"gataag"}
|
|
54
|
+
#
|
|
55
|
+
# Changing the +delimiter+ and order:
|
|
56
|
+
#
|
|
57
|
+
# merge_values(keys: [:ID, :COUNT], delimiter: ':count=')
|
|
58
|
+
#
|
|
59
|
+
# {:ID=>"FOO:count=10", :COUNT=>10, :SEQ=>"gataag"}
|
|
60
|
+
class MergeValues
  STATS = %i(records_in records_out)

  # Constructor for MergeValues.
  #
  # @param  options [Hash] Options hash.
  # @option options [Array]  :keys      Keys whose values to merge; each
  #   entry is converted to a Symbol.
  # @option options [String] :delimiter Delimiter for joining.
  #
  # @return [MergeValues] Class instance of MergeValues.
  def initialize(options)
    @options = options
    check_options
    defaults

    # Normalize once so that the presence check, the lookup and the
    # assignment in the command lambda all use the same key type.
    @keys      = @options[:keys].map(&:to_sym)
    @delimiter = @options[:delimiter]
  end

  # Return command lambda for merge_values.
  #
  # For every record holding all of the requested keys, the values are
  # joined with the delimiter and stored under the first key. Records that
  # lack any of the keys are emitted unchanged.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        if @keys.all? { |key| record.key?(key) }
          record[@keys.first] = record.values_at(*@keys).join(@delimiter)
        end

        output << record
        @status[:records_out] += 1
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :keys, :delimiter)
    options_required(@options, :keys)
  end

  # Set default options.
  def defaults
    @options[:delimiter] ||= '_'
  end
end
|
|
113
|
+
end
|