BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# rubocop: disable ClassLength
|
|
30
|
+
|
|
31
|
+
# == Write tabular output from the stream.
|
|
32
|
+
#
|
|
33
|
+
# Description
|
|
34
|
+
#
|
|
35
|
+
# +write_table+ writes tabular output from the stream.
|
|
36
|
+
#
|
|
37
|
+
# == Usage
|
|
38
|
+
# write_table([keys: <string> | skip: <string>][, output: <file>[, force:
|
|
39
|
+
# <bool>[, header: <bool>[, pretty: <bool>[, commify: <bool>
|
|
40
|
+
# [, delimiter: <string>[, first: <uint> | last: <uint>
|
|
41
|
+
# [, gzip: <bool>, [bzip2: <bool>]]]]]]]]]
|
|
42
|
+
#
|
|
43
|
+
# === Options
|
|
44
|
+
# * keys <string> - Comma separated list of keys to print in that order.
|
|
45
|
+
# * skip <string> - Comma separated list of keys to skip printing.
|
|
46
|
+
# * output <file> - Output file.
|
|
47
|
+
# * force <bool> - Force overwrite existing output file.
|
|
48
|
+
# * header <bool> - Output header.
|
|
49
|
+
# * pretty <bool> - Pretty print table.
|
|
50
|
+
# * commify <bool> - Commify numbers when pretty printing.
|
|
51
|
+
# * delimiter <string> - Specify delimiter (default="\t").
|
|
52
|
+
# * first <uint> - Only output +first+ number of rows.
|
|
53
|
+
# * last <uint> - Only output +last+ number of rows.
|
|
54
|
+
# * gzip <bool> - Write gzipped output file.
|
|
55
|
+
# * bzip2 <bool> - Write bzipped output file.
|
|
56
|
+
#
|
|
57
|
+
# == Examples
|
|
58
|
+
#
|
|
59
|
+
# Consider the following records in the stream:
|
|
60
|
+
#
|
|
61
|
+
# {ORGANISM: Human
|
|
62
|
+
# COUNT: 23524
|
|
63
|
+
# SEQ: ATACGTCAG},
|
|
64
|
+
# {ORGANISM: Dog
|
|
65
|
+
# COUNT: 2442
|
|
66
|
+
# SEQ: AGCATGAC},
|
|
67
|
+
# {ORGANISM: Mouse
|
|
68
|
+
# COUNT: 234
|
|
69
|
+
# SEQ: GACTG},
|
|
70
|
+
# {ORGANISM: Cat
|
|
71
|
+
# COUNT: 2342
|
|
72
|
+
# SEQ: AAATGCA}
|
|
73
|
+
#
|
|
74
|
+
# To write all records from the stream as a table, do:
|
|
75
|
+
#
|
|
76
|
+
# write_table()
|
|
77
|
+
#
|
|
78
|
+
# Human 23524 ATACGTCAG
|
|
79
|
+
# Dog 2442 AGCATGAC
|
|
80
|
+
# Mouse 234 GACTG
|
|
81
|
+
# Cat 2342 AAATGCA
|
|
82
|
+
#
|
|
83
|
+
# If you supply the +header+ option, then the first row in the table will be a
|
|
84
|
+
# 'header' line prefixed with a '#':
|
|
85
|
+
#
|
|
86
|
+
# write_table(header: true)
|
|
87
|
+
#
|
|
88
|
+
# #ORGANISM COUNT SEQ
|
|
89
|
+
# Human 23524 ATACGTCAG
|
|
90
|
+
# Dog 2442 AGCATGAC
|
|
91
|
+
# Mouse 234 GACTG
|
|
92
|
+
# Cat 2342 AAATGCA
|
|
93
|
+
#
|
|
94
|
+
# You can also change the delimiter from the default (tab) to e.g. ',':
|
|
95
|
+
#
|
|
96
|
+
# write_table(delimiter: ',')
|
|
97
|
+
#
|
|
98
|
+
# Human,23524,ATACGTCAG
|
|
99
|
+
# Dog,2442,AGCATGAC
|
|
100
|
+
# Mouse,234,GACTG
|
|
101
|
+
# Cat,2342,AAATGCA
|
|
102
|
+
#
|
|
103
|
+
# If you want the values output in a specific order you have to supply a comma
|
|
104
|
+
# separated list using the +keys+ option that will print only those keys in
|
|
105
|
+
# that order:
|
|
106
|
+
#
|
|
107
|
+
# write_table(keys: [:SEQ, :COUNT])
|
|
108
|
+
#
|
|
109
|
+
# ATACGTCAG 23524
|
|
110
|
+
# AGCATGAC 2442
|
|
111
|
+
# GACTG 234
|
|
112
|
+
# AAATGCA 2342
|
|
113
|
+
#
|
|
114
|
+
# Keys in the format V0, V1, V2 ... Vn are automatically sorted numerically.
|
|
115
|
+
#
|
|
116
|
+
# Alternatively, if you have some keys that you don't want in the tabular
|
|
117
|
+
# output, use the +skip+ option. So to print all keys except SEQ
|
|
118
|
+
# do:
|
|
119
|
+
#
|
|
120
|
+
# write_table(skip: [:SEQ])
|
|
121
|
+
#
|
|
122
|
+
# Human 23524
|
|
123
|
+
# Dog 2442
|
|
124
|
+
# Mouse 234
|
|
125
|
+
# Cat 2342
|
|
126
|
+
#
|
|
127
|
+
# And if you want a pretty printed table use the +pretty+ option and throw in
|
|
128
|
+
# the +commify+ option if you want commified numbers:
|
|
129
|
+
#
|
|
130
|
+
# write_table(pretty: true, header: true, commify: true)
|
|
131
|
+
#
|
|
132
|
+
# +----------+--------+-----------+
|
|
133
|
+
# | ORGANISM | COUNT | SEQ |
|
|
134
|
+
# +----------+--------+-----------+
|
|
135
|
+
# | Human | 23,524 | ATACGTCAG |
|
|
136
|
+
# | Dog | 2,442 | AGCATGAC |
|
|
137
|
+
# | Mouse | 234 | GACTG |
|
|
138
|
+
# | Cat | 2,342 | AAATGCA |
|
|
139
|
+
# +----------+--------+-----------+
|
|
140
|
+
#
|
|
141
|
+
# To write a table to a file 'test.tab':
|
|
142
|
+
#
|
|
143
|
+
# write_table(output: "test.tab")
|
|
144
|
+
#
|
|
145
|
+
# To write a table to a file 'test.tab' with only the first 3 rows:
|
|
146
|
+
#
|
|
147
|
+
# write_table(output: "test.tab", first: 3)
|
|
148
|
+
#
|
|
149
|
+
# To write a table to a file 'test.tab' with only the last 3 rows:
|
|
150
|
+
#
|
|
151
|
+
# write_table(output: "test.tab", last: 3)
|
|
152
|
+
#
|
|
153
|
+
# To overwrite output file if this exists use the +force+ option:
|
|
154
|
+
#
|
|
155
|
+
# write_table(output: "test.tab", force: true)
|
|
156
|
+
#
|
|
157
|
+
# To write gzipped output to a file 'test.tab.gz'.
|
|
158
|
+
#
|
|
159
|
+
# write_table(output: "test.tab.gz", gzip: true)
|
|
160
|
+
#
|
|
161
|
+
# To write bzipped output to a file 'test.tab.bz2'.
|
|
162
|
+
#
|
|
163
|
+
# write_table(output: "test.tab.bz2", bzip2: true)
|
|
164
|
+
class WriteTable
|
|
165
|
+
require 'set'
|
|
166
|
+
require 'terminal-table'
|
|
167
|
+
|
|
168
|
+
STATS = %i(records_in records_out)
|
|
169
|
+
|
|
170
|
+
# Constructor for WriteTable.
#
# Stores the options hash, validates it and sets up the per-run state: the
# compression choice, the header flag and the two row buffers.
#
# @param options [Hash] Options hash.
# @option options [Array]   :keys
# @option options [Array]   :skip
# @option options [String]  :output
# @option options [Boolean] :force
# @option options [Boolean] :header
# @option options [Boolean] :pretty
# @option options [Boolean] :commify
# @option options [String]  :delimiter
# @option options [Integer] :first
# @option options [Integer] :last
# @option options [Boolean] :gzip
# @option options [Boolean] :bzip2
#
# @return [WriteTable] Class instance.
def initialize(options)
  @options = options
  check_options

  # The tab default is filled in only after check_options has run, so the
  # delimiter/pretty conflict check there sees just what the caller passed.
  @options[:delimiter] ||= "\t"

  @compress = choose_compression
  @header   = !!@options[:header] # Cleared once the header has been written.
  @headings = nil                 # Compiled from the first record seen.
  @rows     = []                  # Buffer of all rows for pretty printing.
  @last     = []                  # Buffer of trailing rows for the :last option.
end
|
|
197
|
+
|
|
198
|
+
# Return command lambda for write_table.
#
# The lambda initialises the status counters and then writes the table
# either to the file given with the :output option (opened through
# Filesys.open with the configured compression) or to $stdout.
#
# @return [Proc] Command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    path = @options[:output]

    # No output file requested: print the table on stdout.
    return write_table(input, output, $stdout) unless path

    Filesys.open(path, 'w', compress: @compress) do |tab_out|
      write_table(input, output, tab_out)
    end
  end
end
|
|
214
|
+
|
|
215
|
+
private
|
|
216
|
+
|
|
217
|
+
# Check options.
#
# Runs the options_* validation helpers over @options in a fixed order:
# allowed keys, mutually exclusive pairs, allowed values, ties and
# conflicts, and finally the output file check.
def check_options
  boolean = [nil, true, false]
  allowed = %i(keys skip output force header pretty commify delimiter first
               last gzip bzip2)

  options_allowed(@options, *allowed)
  options_unique(@options, :keys, :skip)
  options_unique(@options, :first, :last)
  options_unique(@options, :gzip, :bzip2)
  options_allowed_values(@options, force: boolean)
  options_allowed_values(@options, header: boolean)
  options_tie(@options, commify: :pretty)
  options_conflict(@options, delimiter: :pretty)
  options_allowed_values(@options, pretty: boolean, commify: boolean,
                                   gzip: boolean, bzip2: boolean)
  options_tie(@options, gzip: :output, bzip2: :output)
  options_files_exist_force(@options, :output)
end
|
|
235
|
+
|
|
236
|
+
# Choose the compression to use: gzip, bzip2 or none. The :gzip option is
# consulted before :bzip2.
#
# @return [Symbol,nil] :gzip, :bzip2 or nil for no compression.
def choose_compression
  return :gzip if @options[:gzip]

  :bzip2 if @options[:bzip2]
end
|
|
247
|
+
|
|
248
|
+
# Read records from the input stream, turn each into a table row and pass
# the record on to the output stream when one is given. Rows are either
# buffered (pretty printing) or handed to output_row right away; once the
# input is exhausted the buffered rows are written.
#
# @param input   [Enumerator]          Input stream.
# @param output  [Enumerator::Yielder] Output stream.
# @param tab_out [IO,STDOUT]           Output to file or stdout.
def write_table(input, output, tab_out)
  pretty = @options[:pretty]

  input.each_with_index do |record, i|
    @status[:records_in] += 1

    # Headings are compiled once, from the first record seen.
    compile_headings(record) unless @headings

    row = record.values_at(*@headings)

    pretty ? @rows << row : output_row(tab_out, row, i)

    next unless output

    output << record
    @status[:records_out] += 1
  end

  if pretty
    output_pretty(tab_out)
  else
    output_last(tab_out)
  end
end
|
|
276
|
+
|
|
277
|
+
# Compile the list of headings (record keys) used for every table row and
# store it in @headings.
#
# The :keys option wins when given; otherwise the keys of +record+ are used,
# numerically sorted when the first key looks like V<digits>. Headings listed
# in the :skip option are removed afterwards.
#
# @param record [Hash] BioDSL record.
def compile_headings(record)
  requested = @options[:keys]

  @headings =
    if requested
      requested.map(&:to_sym)
    elsif record.keys.first =~ /^V\d+$/
      sort_keys(record)
    else
      record.keys
    end

  skip_headings if @options[:skip]
end
|
|
291
|
+
|
|
292
|
+
# Sort keys in the form V[0-9]+ on the numerical part in ascending order.
#
# @param record [Hash] BioDSL record whose keys are to be sorted.
#
# @return [Array] The record's keys ordered by the integer that follows the
#   leading character.
def sort_keys(record)
  # Drop the single leading character and compare the whole remainder as an
  # integer. Each key is sliced by its own length, so zero-padded keys such
  # as V007 are no longer truncated to the length of the key they are
  # compared against.
  record.keys.sort_by { |key| key.to_s[1..-1].to_i }
end
|
|
298
|
+
|
|
299
|
+
# Output a single row, preceded by the header line while the header flag is
# still set. Rows holding nothing but nil values are dropped. Depending on
# the options the row is written only if it is among the first rows,
# buffered for the :last option, or written straight away.
#
# @param tab_out [IO,STDOUT] Table output IO.
# @param row     [Array]     Row to output.
# @param i       [Integer]   Row number.
def output_row(tab_out, row, i)
  output_header(tab_out) if @header

  return if row.compact.empty?
  return process_first(tab_out, row, i) if @options[:first]
  return process_last(row) if @options[:last]

  tab_out.puts row.join(@options[:delimiter])
end
|
|
317
|
+
|
|
318
|
+
# Write the header line ('#' followed by the delimiter-joined headings) to
# the given IO, unless all headings are nil, and clear the header flag so
# the header is not written again.
#
# @param tab_out [IO,STDOUT] Table output IO.
def output_header(tab_out)
  header_line = "##{@headings.join(@options[:delimiter])}"

  tab_out.puts header_line unless @headings.compact.empty?

  @header = false
end
|
|
328
|
+
|
|
329
|
+
# Write the row to the IO when its row number is below the count given with
# the :first option; otherwise do nothing.
#
# @param tab_out [IO,STDOUT] Table output IO.
# @param row     [Array]     Row with table data.
# @param i       [Integer]   Row number.
def process_first(tab_out, row, i)
  tab_out.puts row.join(@options[:delimiter]) if i < @options[:first]
end
|
|
338
|
+
|
|
339
|
+
# Append the row to the buffer of trailing rows and drop the oldest entry
# when the buffer has grown beyond the count given with the :last option.
#
# @param row [Array] Row with table data.
def process_last(row)
  @last.push(row)

  return unless @last.size > @options[:last]

  @last.shift
end
|
|
347
|
+
|
|
348
|
+
# Remove the headings named in the :skip option from @headings (in place).
# The option values are converted to symbols before comparison.
def skip_headings
  unwanted = Set.new(@options[:skip].map(&:to_sym))

  @headings.reject! { |heading| unwanted.include?(heading) }
end
|
|
353
|
+
|
|
354
|
+
# Output the buffered rows as a pretty printed Terminal::Table. Does nothing
# unless the :pretty option is set. With no buffered rows an empty table is
# written; otherwise headings (when :header is set), commified numbers (when
# :commify is set), rows and column alignment are applied first.
#
# @param tab_out [IO,STDOUT] Table output IO.
def output_pretty(tab_out)
  return unless @options[:pretty]

  table = Terminal::Table.new

  return tab_out.puts(table) if @rows.empty?

  table.headings = @headings if @options[:header]
  commify if @options[:commify]
  fill_table(table)
  align_columns(table)

  tab_out.puts(table)
end
|
|
371
|
+
|
|
372
|
+
# Insert commas in large numbers for readability: every Integer or Float
# cell in the buffered rows is replaced, in place, by the result of calling
# +commify+ on it. All other cells are left as they are.
def commify
  @rows.each do |row|
    row.map! do |cell|
      case cell
      when Integer then cell.to_i.commify
      when Float   then cell.to_f.commify
      else cell
      end
    end
  end
end
|
|
384
|
+
|
|
385
|
+
# Fill the terminal table with the buffered rows: only the leading rows when
# the :first option is set, only the trailing rows when the :last option is
# set, and all rows otherwise.
#
# @param table [Terminal::Table] Table to be pretty printed.
def fill_table(table)
  first, last = @options.values_at(:first, :last)

  table.rows =
    if first
      @rows.first(first)
    elsif last
      @rows.last(last)
    else
      @rows
    end
end
|
|
397
|
+
|
|
398
|
+
# Iterate over the first buffered row and right-align every column whose
# cell in that row is numeric: an Integer, a Float, or a String made up of
# digits once commas are removed (e.g. a commified integer).
#
# Cells of any other kind (nil for a missing key, Symbols, ...) leave their
# column at the default alignment.
#
# @param table [Terminal::Table] Table to be pretty printed.
def align_columns(table)
  @rows.first.each_with_index do |cell, i|
    # Integer is used instead of Fixnum, which newer Rubies no longer define.
    # The String guard keeps nil and Symbol cells from being sent #delete.
    numeric = cell.is_a?(Integer) ||
              cell.is_a?(Float) ||
              (cell.is_a?(String) && cell.delete(',') =~ /^[0-9]+$/)

    table.align_column(i, :right) if numeric
  end
end
|
|
411
|
+
|
|
412
|
+
# Write every row held in the buffer of trailing rows to the IO, joined
# with the configured delimiter.
#
# @param tab_out [IO,STDOUT] Table output IO.
def output_last(tab_out)
  delimiter = @options[:delimiter]

  @last.each do |row|
    tab_out.puts(row.join(delimiter))
  end
end
|
|
418
|
+
end
|
|
419
|
+
end
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Write aligned sequences from stream as a tree.
|
|
30
|
+
#
|
|
31
|
+
# Description
|
|
32
|
+
#
|
|
33
|
+
# +write_tree+ takes aligned sequences from the stream and uses FastTree to
|
|
34
|
+
# create a distance tree between the sequences. The tree is in Newick format.
|
|
35
|
+
# FastTree must be installed.
|
|
36
|
+
#
|
|
37
|
+
# For more about FastTree see:
|
|
38
|
+
#
|
|
39
|
+
# http://www.microbesonline.org/fasttree/
|
|
40
|
+
#
|
|
41
|
+
# == Usage
|
|
42
|
+
# write_tree([output: <file>[, force: <bool>[, type: <string>]]])
|
|
43
|
+
#
|
|
44
|
+
# === Options
|
|
45
|
+
# * output <file> - Output file.
|
|
46
|
+
# * force <bool> - Force overwrite existing output file.
|
|
47
|
+
# * type <string> - Sequence type :dna|:rna|:protein (default=:dna).
|
|
48
|
+
#
|
|
49
|
+
# == Examples
|
|
50
|
+
#
|
|
51
|
+
# To create a tree from aligned FASTA sequences in the file `align.fna` do:
|
|
52
|
+
#
|
|
53
|
+
# BP.new.
|
|
54
|
+
# read_fasta(input: "align.fna").
|
|
55
|
+
# write_tree(output: "align.tree").
|
|
56
|
+
# run
|
|
57
|
+
class WriteTree
|
|
58
|
+
require 'open3'
|
|
59
|
+
require 'BioDSL/helpers/aux_helper'
|
|
60
|
+
|
|
61
|
+
include AuxHelper
|
|
62
|
+
|
|
63
|
+
STATS = %i(records_in records_out sequences_in residues_in)
|
|
64
|
+
|
|
65
|
+
# Constructor for WriteTree.
#
# Stores the options, checks for the external FastTree program via
# aux_exist, validates the options and compiles the FastTree command line.
#
# @param options [Hash] Options hash.
# @option options [String]  :output
# @option options [Boolean] :force
# @option options [Symbol]  :type
#
# @return [WriteTree] Class instance.
def initialize(options)
  @options = options

  aux_exist('FastTree')
  check_options

  @cmd = compile_command
end
|
|
81
|
+
|
|
82
|
+
# rubocop: disable Metrics/AbcSize
|
|
83
|
+
# rubocop: disable MethodLength
|
|
84
|
+
|
|
85
|
+
# Return command lambda for write_tree.
#
# The lambda pipes every record holding a :SEQ value to FastTree as FASTA,
# passes all records on to the output stream (when one is given) and, once
# FastTree has exited successfully, writes the tree read from its stdout
# via write_tree.
#
# @raise [RuntimeError] With FastTree's stderr output as message if FastTree
#   exits unsuccessfully.
#
# @return [Proc] Command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    Open3.popen3(@cmd) do |stdin, stdout, stderr, wait_thr|
      # Drain stderr concurrently. If it were read only after the child has
      # exited, a child that fills the stderr pipe would block forever while
      # this side is still writing stdin or waiting on stdout.
      err_reader = Thread.new { stderr.read }

      input.each_with_index do |record, i|
        @status[:records_in] += 1

        write_seq(stdin, record, i) if record[:SEQ]

        next unless output

        output << record
        @status[:records_out] += 1
      end

      # Closing stdin signals end of input to FastTree.
      stdin.close

      tree_data = stdout.read.chomp

      stdout.close

      exit_status = wait_thr.value

      fail err_reader.value unless exit_status.success?

      write_tree(tree_data)
    end
  end
end
|
|
115
|
+
|
|
116
|
+
# rubocop: enable Metrics/AbcSize
|
|
117
|
+
# rubocop: enable MethodLength
|
|
118
|
+
|
|
119
|
+
private
|
|
120
|
+
|
|
121
|
+
# Check options.
#
# Runs the options_* validation helpers over @options: allowed keys, the
# allowed values of :type, and the output file check.
def check_options
  sequence_types = %i(dna rna protein)

  options_allowed(@options, :force, :output, :type)
  options_allowed_values(@options, type: sequence_types)
  options_files_exist_force(@options, :output)
end
|
|
127
|
+
|
|
128
|
+
# Compile the command line for running FastTree. The -nt flag is added
# unless the :type option is :protein, and -quiet is added unless
# BioDSL.verbose is set.
#
# @return [String] FastTree command.
def compile_command
  flags = []
  flags << '-nt'    unless @options[:type] == :protein
  flags << '-quiet' unless BioDSL.verbose

  ['FastTree', *flags].join(' ')
end
|
|
138
|
+
|
|
139
|
+
# Convert a record to a sequence entry, update the sequence and residue
# counters and write the entry in FASTA format to the given IO. An entry
# without a sequence name gets the record index as its name.
#
# @param stdin  [IO]      Open3 IO.
# @param record [Hash]    BioDSL record.
# @param i      [Integer] Record index.
def write_seq(stdin, record, i)
  entry = BioDSL::Seq.new_bp(record)
  entry.seq_name = i unless entry.seq_name

  @status[:sequences_in] += 1
  @status[:residues_in]  += entry.length

  stdin.puts(entry.to_fasta)
end
|
|
153
|
+
|
|
154
|
+
# Write tree data to the file given with the :output option, or to stdout
# when no output file is set.
#
# @param tree_data [String] Tree data in Newick format.
def write_tree(tree_data)
  path = @options[:output]

  return puts(tree_data) unless path

  File.open(path, 'w') { |ios| ios.puts tree_data }
end
|
|
166
|
+
end
|
|
167
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
16
|
+
# #
|
|
17
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
18
|
+
# #
|
|
19
|
+
# This software is part of BioDSL (www.github.com/maasha/BioDSL). #
|
|
20
|
+
# #
|
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
22
|
+
|
|
23
|
+
module BioDSL
  # Module that requires all files in the BioDSL/commands/ directory.
  module Commands
    pattern = File.join(File.dirname(__FILE__), 'commands', '*')

    Dir[pattern].each do |file|
      # Keep only the last three path components (<dir>/commands/<file>) and
      # drop the .rb extension before handing the name to require.
      feature = file.split(File::SEPARATOR).last(3).join(File::SEPARATOR)

      require feature.chomp('.rb')
    end
  end
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
  # Module with Config constants.
  #
  # The history, log and rc file paths are rooted in ENV['HOME']. TMP_DIR and
  # CORES_MAX are taken from the :pipeline options loaded by options_load_rc
  # when present, and fall back to Dir.tmpdir and Parallel.processor_count.
  module Config
    require 'parallel'
    require 'tmpdir' # Provides Dir.tmpdir, used as the TMP_DIR fallback.
    require 'BioDSL/helpers/options_helper'

    extend OptionsHelper

    HISTORY_FILE             = File.join(ENV['HOME'], '.BioDSL_history')
    LOG_FILE                 = File.join(ENV['HOME'], '.BioDSL_log')
    RC_FILE                  = File.join(ENV['HOME'], '.BioDSLrc')
    STATUS_PROGRESS_INTERVAL = 0.1 # update progress every n second.

    # NOTE(review): options_load_rc is defined outside this file; it is
    # assumed to return nil or a Hash whose values are Arrays - confirm.
    options = options_load_rc({}, :pipeline)

    # Array() maps a missing (nil) entry to [], so options that carry only
    # one of the two keys no longer raise NoMethodError on nil.empty?.
    tmp_dirs    = options ? Array(options[:tmp_dir]) : []
    core_counts = options ? Array(options[:processor_count]) : []

    TMP_DIR = tmp_dirs.empty? ? Dir.tmpdir : tmp_dirs.first

    CORES_MAX = if core_counts.empty?
                  Parallel.processor_count
                else
                  core_counts.first.to_i
                end
  end
end
|