BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
@@ -0,0 +1,329 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
module BioDSL
|
29
|
+
# == Read tabular data from one or more files.
|
30
|
+
#
|
31
|
+
# Tabular input can be read with +read_table+ which will read in chosen rows
|
32
|
+
# and chosen columns (separated by a given delimiter) from a table in ASCII
|
33
|
+
# text format.
|
34
|
+
#
|
35
|
+
# If no +keys+ option is given and there is a comment line beginning with #
|
36
|
+
# the fields here will be used as keys. Subsequent lines beginning with #
|
37
|
+
# will be ignored.
|
38
|
+
#
|
39
|
+
# If a comment line is present beginning with a # the options +select+ and
|
40
|
+
# +reject+ can be used to choose which columns to read.
|
41
|
+
#
|
42
|
+
# == Usage
|
43
|
+
# read_table(input: <glob>[, first: <uint>|last: <uint>][, select: <list>
|
44
|
+
# |, reject: <list>[, keys: <list>][, skip: <uint>
|
45
|
+
# [, delimiter: <string>]]])
|
46
|
+
#
|
47
|
+
# === Options
|
48
|
+
# * input <glob> - Input file or file glob expression.
|
49
|
+
# * first <uint> - Only read in the _first_ number of entries.
|
50
|
+
# * last <uint> - Only read in the _last_ number of entries.
|
51
|
+
# * select <list> - List of column indexes or header keys to read.
|
52
|
+
# * reject <list> - List of column indexes or header keys to skip.
|
53
|
+
# * keys <list> - List of key identifiers to use for each column.
|
54
|
+
# * skip <uint> - Number of initial lines to skip (default=0).
|
55
|
+
# * delimiter <string> - Delimiter to use for separating columns
|
56
|
+
# (default="\s+").
|
57
|
+
#
|
58
|
+
# == Examples
|
59
|
+
#
|
60
|
+
# To read all entries from a file:
|
61
|
+
#
|
62
|
+
# read_table(input: "test.tab")
|
63
|
+
#
|
64
|
+
# To read all entries from a gzipped file:
|
65
|
+
#
|
66
|
+
# read_table(input: "test.tab.gz")
|
67
|
+
#
|
68
|
+
# To read in only 10 records from a file:
|
69
|
+
#
|
70
|
+
# read_table(input: "test.tab", first: 10)
|
71
|
+
#
|
72
|
+
# To read in the last 10 records from a file:
|
73
|
+
#
|
74
|
+
# read_table(input: "test.tab", last: 10)
|
75
|
+
#
|
76
|
+
# To read all entries from multiple files:
|
77
|
+
#
|
78
|
+
# read_table(input: "test1.tab,test2.tab")
|
79
|
+
#
|
80
|
+
# To read entries from multiple files using a glob expression:
|
81
|
+
#
|
82
|
+
# read_table(input: "*.tab")
|
83
|
+
#
|
84
|
+
# Consider the following table from the file test.tab:
|
85
|
+
#
|
86
|
+
# #Organism Sequence Count
|
87
|
+
# Human ATACGTCAG 23524
|
88
|
+
# Dog AGCATGAC 2442
|
89
|
+
# Mouse GACTG 234
|
90
|
+
# Cat AAATGCA 2342
|
91
|
+
#
|
92
|
+
# Reading the entire table will result in 4 records, one for each row,
|
93
|
+
# where the keys Organism, Sequence and Count are taken from the comment
|
94
|
+
# line prefixed with #:
|
95
|
+
#
|
96
|
+
# BP.new.read_table(input: "test.tab").dump.run
|
97
|
+
#
|
98
|
+
# {:Organism=>"Human", :Sequence=>"ATACGTCAG", :Count=>23524}
|
99
|
+
# {:Organism=>"Dog", :Sequence=>"AGCATGAC", :Count=>2442}
|
100
|
+
# {:Organism=>"Mouse", :Sequence=>"GACTG", :Count=>234}
|
101
|
+
# {:Organism=>"Cat", :Sequence=>"AAATGCA", :Count=>2342}
|
102
|
+
#
|
103
|
+
# However, if the first line is skipped using the +skip+ option the keys
|
104
|
+
# will default to V0, V1, V2 ... Vn:
|
105
|
+
#
|
106
|
+
# BP.new.read_table(input: "test.tab", skip: 1).dump.run
|
107
|
+
#
|
108
|
+
# {:V0=>"Human", :V1=>"ATACGTCAG", :V2=>23524}
|
109
|
+
# {:V0=>"Dog", :V1=>"AGCATGAC", :V2=>2442}
|
110
|
+
# {:V0=>"Mouse", :V1=>"GACTG", :V2=>234}
|
111
|
+
# {:V0=>"Cat", :V1=>"AAATGCA", :V2=>2342}
|
112
|
+
#
|
113
|
+
# To explicitly name the columns (or the keys) use the +keys+ option:
|
114
|
+
#
|
115
|
+
# BP.new.
|
116
|
+
# read_table(input: "test.tab", skip: 1, keys: [:ORGANISM, :SEQ, :COUNT]).
|
117
|
+
# dump.
|
118
|
+
# run
|
119
|
+
#
|
120
|
+
# {:ORGANISM=>"Human", :SEQ=>"ATACGTCAG", :COUNT=>23524}
|
121
|
+
# {:ORGANISM=>"Dog", :SEQ=>"AGCATGAC", :COUNT=>2442}
|
122
|
+
# {:ORGANISM=>"Mouse", :SEQ=>"GACTG", :COUNT=>234}
|
123
|
+
# {:ORGANISM=>"Cat", :SEQ=>"AAATGCA", :COUNT=>2342}
|
124
|
+
#
|
125
|
+
# It is possible to select a subset of columns to read by using the
|
126
|
+
# +select+ option which takes a comma separated list of column numbers
|
127
|
+
# (first column is designated 0) or header keys as (requires header)
|
128
|
+
# argument. So to read in only the sequence and the count so that the
|
129
|
+
# count comes before the sequence do:
|
130
|
+
#
|
131
|
+
# BP.new.read_table(input: "test.tab", skip: 1, select: [2, 1]).dump.run
|
132
|
+
#
|
133
|
+
# {:V0=>23524, :V1=>"ATACGTCAG"}
|
134
|
+
# {:V0=>2442, :V1=>"AGCATGAC"}
|
135
|
+
# {:V0=>234, :V1=>"GACTG"}
|
136
|
+
# {:V0=>2342, :V1=>"AAATGCA"}
|
137
|
+
#
|
138
|
+
# Alternatively, if a header line was present in the file:
|
139
|
+
#
|
140
|
+
# #Organism Sequence Count
|
141
|
+
#
|
142
|
+
# Then the header keys can be used:
|
143
|
+
#
|
144
|
+
# BP.new.
|
145
|
+
# read_table(input: "test.tab", skip: 1, select: [:Count, :Sequence]).
|
146
|
+
# dump.
|
147
|
+
# run
|
148
|
+
#
|
149
|
+
# {:Count=>23524, :Sequence=>"ATACGTCAG"}
|
150
|
+
# {:Count=>2442, :Sequence=>"AGCATGAC"}
|
151
|
+
# {:Count=>234, :Sequence=>"GACTG"}
|
152
|
+
# {:Count=>2342, :Sequence=>"AAATGCA"}
|
153
|
+
#
|
154
|
+
# Likewise, it is possible to reject specified columns from being read
|
155
|
+
# using the +reject+ option:
|
156
|
+
#
|
157
|
+
# BP.new.read_table(input: "test.tab", skip: 1, reject: [2, 1]).dump.run
|
158
|
+
#
|
159
|
+
# {:V0=>"Human"}
|
160
|
+
# {:V0=>"Dog"}
|
161
|
+
# {:V0=>"Mouse"}
|
162
|
+
# {:V0=>"Cat"}
|
163
|
+
#
|
164
|
+
# And again, the header keys can be used if a header is present:
|
165
|
+
#
|
166
|
+
# BP.new.
|
167
|
+
# read_table(input: "test.tab", skip: 1, reject: [:Count, :Sequence]).
|
168
|
+
# dump.
|
169
|
+
# run
|
170
|
+
#
|
171
|
+
# {:Organism=>"Human"}
|
172
|
+
# {:Organism=>"Dog"}
|
173
|
+
# {:Organism=>"Mouse"}
|
174
|
+
# {:Organism=>"Cat"}
|
175
|
+
#
|
176
|
+
# rubocop: disable ClassLength
|
177
|
+
class ReadTable
|
178
|
+
# Names of the status counters used by this command. Frozen so the shared
# constant cannot be modified by accident.
STATS = %i(records_in records_out).freeze
|
179
|
+
|
180
|
+
# Constructor for ReadTable.
#
# Stores the options hash, prepares an empty record buffer, defaults the
# number of lines to skip to 0, converts any given column keys to symbols
# and finally validates the options with +check_options+.
#
# @param options [Hash] Options hash.
# @option options [String]  :input     Input file or file glob expression.
# @option options [Integer] :first     Only read in the first number of entries.
# @option options [Integer] :last      Only read in the last number of entries.
# @option options [Array]   :keys      Key identifiers to use for each column.
# @option options [Integer] :skip      Number of initial lines to skip.
# @option options [String]  :delimiter Delimiter used for separating columns.
# @option options [Array]   :select    Column indexes or header keys to read.
# @option options [Array]   :reject    Column indexes or header keys to skip.
#
# @return [ReadTable] Class instance.
def initialize(options)
  @options = options
  @buffer  = []
  @skip    = options[:skip] || 0
  @keys    = (options[:keys].map(&:to_sym) if options[:keys])

  check_options
end
|
201
|
+
|
202
|
+
# Return the command lambda for ReadTable.
#
# When called, the lambda initialises the status counters, hands the input
# stream to +process_input+ and then reads the table files: with +read_first+
# if the :first option is set, with +read_last+ if the :last option is set,
# and with +read_all+ otherwise.
#
# @return [Proc] Command lambda taking input, output and status.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)
    process_input(input, output)

    if @options[:first]
      read_first(output)
    elsif @options[:last]
      read_last(output)
    else
      read_all(output)
    end
  end
end
|
218
|
+
|
219
|
+
private
|
220
|
+
|
221
|
+
# Check the options given to ReadTable.
#
# Runs the options_* helper checks (defined outside this class) in a fixed
# order: allowed option keys, the required :input option, the input files,
# the :first/:last and :select/:reject pairs, the list options, and finally
# the ">= 0" assertions for :first, :last and :skip.
def check_options
  options_allowed(@options, :input, :first, :last, :keys, :skip, :delimiter,
                  :select, :reject)
  options_required(@options, :input)
  options_files_exist(@options, :input)

  [%i(first last), %i(select reject)].each do |pair|
    options_unique(@options, *pair)
  end

  options_list_unique(@options, :keys, :select, :reject)

  options_assert(@options, ':first >= 0')
  options_assert(@options, ':last >= 0')
  options_assert(@options, ':skip >= 0')
end
|
234
|
+
|
235
|
+
# Collect the options that are passed on to CSV#each_hash.
#
# @return [Hash] Hash holding the :delimiter, :select and :reject values
#   taken from the command options.
def read_options
  %i(delimiter select reject).each_with_object({}) do |key, opts|
    opts[key] = @options[key]
  end
end
|
243
|
+
|
244
|
+
# Read the :first entries from the input files and emit them to the output
# stream.
#
# Only records read from the files count towards :first, so records that
# were already passed through from the input stream do not shorten the read.
# Column keys given with the :keys option are applied to every record, in
# the same way as +read_all+ does. With first: 0 nothing is read.
#
# @param output [Enumerator::Yielder] Output stream.
def read_first(output)
  remaining = @options[:first]
  return if remaining <= 0

  options_glob(@options[:input]).each do |file|
    BioDSL::CSV.open(file) do |ios|
      ios.skip(@skip)

      ios.each_hash(read_options) do |record|
        replace_keys(record) if @keys
        output << record
        @status[:records_out] += 1
        remaining -= 1

        # Leave the method (and thereby all enclosing blocks) once enough
        # records have been emitted.
        return if remaining.zero?
      end
    end
  end
end
|
260
|
+
|
261
|
+
# Read the :last entries from the input files and emit them to the output
# stream.
#
# Records from all files are pushed through a buffer that never holds more
# than :last records. Column keys given with the :keys option are applied
# to the buffered records before they are emitted, in the same way as
# +read_all+ does.
#
# @param output [Enumerator::Yielder] Output stream.
def read_last(output)
  limit = @options[:last]

  options_glob(@options[:input]).each do |file|
    BioDSL::CSV.open(file) do |ios|
      ios.skip(@skip)

      ios.each_hash(read_options) do |record|
        @buffer << record
        @buffer.shift if @buffer.size > limit
      end
    end
  end

  # Rename only the records that are actually emitted.
  @buffer.each { |record| replace_keys(record) } if @keys

  output_buffer(output)
end
|
278
|
+
|
279
|
+
# Read every entry from the input files and emit it to the output stream.
#
# For each file the initial lines given by the skip setting are skipped,
# column keys are replaced when the :keys option was given, and the
# records_out counter is incremented for every emitted record.
#
# @param output [Enumerator::Yielder] Output stream.
def read_all(output)
  files = options_glob(@options[:input])

  files.each do |path|
    BioDSL::CSV.open(path) do |csv|
      csv.skip(@skip)

      csv.each_hash(read_options) do |row|
        replace_keys(row) if @keys

        output << row
        @status[:records_out] += 1
      end
    end
  end
end
|
295
|
+
|
296
|
+
# Replace the keys of a given record with the keys from the :keys option.
#
# The first @keys.size columns are renamed in order; remaining columns keep
# their key. The record is rebuilt in one pass so that column order is
# preserved and no value is lost when a new key equals a key already present
# in the record (e.g. keys: [:V0] or swapped keys). If a new key collides
# with a column that is not renamed, the renamed column wins.
#
# @param record [Hash] BioDSL record (modified in place).
#
# @return [Hash] The modified record.
def replace_keys(record)
  renamed = {}

  record.each_with_index do |(key, value), i|
    if i < @keys.size
      renamed[@keys[i]] = value
    elsif !renamed.key?(key)
      renamed[key] = value
    end
  end

  record.replace(renamed)
end
|
305
|
+
|
306
|
+
# Emit all records held in the buffer to the output stream, incrementing the
# records_out counter once per emitted record.
#
# @param output [Enumerator::Yielder] Output stream.
def output_buffer(output)
  @buffer.each do |buffered|
    output << buffered
    @status[:records_out] = @status[:records_out] + 1
  end
end
|
315
|
+
|
316
|
+
# Emit all records from the input stream to the output stream.
#
# Nothing is done when either stream is missing. Without this guard a nil
# input would raise NoMethodError when it is iterated.
#
# @param input  [Enumerator, nil]          Input stream.
# @param output [Enumerator::Yielder, nil] Output stream.
def process_input(input, output)
  return unless input && output

  input.each do |record|
    output << record
    @status[:records_in] += 1
    @status[:records_out] += 1
  end
end
|
328
|
+
end
|
329
|
+
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
module BioDSL
|
29
|
+
# == Reverse sequences in the stream.
|
30
|
+
#
|
31
|
+
# +reverse_seq+ reverses sequences in the stream. If a SCORES key is found
|
32
|
+
# then the SCORES are also reversed.
|
33
|
+
#
|
34
|
+
# +reverse_seq+ can be used together with +complement_seq+ to reverse-
|
35
|
+
# complement sequences.
|
36
|
+
#
|
37
|
+
# == Usage
|
38
|
+
#
|
39
|
+
# reverse_seq()
|
40
|
+
#
|
41
|
+
# === Options
|
42
|
+
#
|
43
|
+
# == Examples
|
44
|
+
#
|
45
|
+
# Consider the following FASTQ entry in the file test.fq:
|
46
|
+
#
|
47
|
+
# @M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185
|
48
|
+
# TTGTAAAACGACGGCCAGTG
|
49
|
+
# +
|
50
|
+
# >>>>>FFFFD@A?A0AE0FG
|
51
|
+
#
|
52
|
+
# To reverse the sequence simply do:
|
53
|
+
#
|
54
|
+
# BP.new.read_fastq(input:"test.fq").reverse_seq.dump.run
|
55
|
+
#
|
56
|
+
# {:SEQ_NAME=>"M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185",
|
57
|
+
# :SEQ=>"GTGACCGGCAGCAAAATGTT",
|
58
|
+
# :SEQ_LEN=>20,
|
59
|
+
# :SCORES=>"GF0EA0A?A@DFFFF>>>>>"}
|
60
|
+
class ReverseSeq
|
61
|
+
# Names of the status counters used by this command. Frozen so the shared
# constant cannot be modified by accident.
STATS = %i(records_in records_out sequences_in sequences_out residues_in
           residues_out).freeze
|
63
|
+
|
64
|
+
# Constructor for ReverseSeq.
#
# Stores the options hash and validates it with +check_options+.
#
# @param options [Hash] Options hash.
#
# @return [ReverseSeq] Class instance.
def initialize(options)
  @options = options
  check_options
end
|
74
|
+
|
75
|
+
# Return the command lambda for reverse_seq.
#
# When called, the lambda initialises the status counters and then passes
# every record from the input stream to the output stream, reversing those
# records that have a :SEQ value on the way.
#
# @return [Proc] Command lambda taking input, output and status.
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    input.each do |record|
      @status[:records_in] += 1

      if record[:SEQ]
        reverse(record)
      end

      output << record
      @status[:records_out] += 1
    end
  end
end
|
90
|
+
|
91
|
+
private
|
92
|
+
|
93
|
+
# Check the options given to reverse_seq.
#
# NOTE(review): nil is passed in place of a list of allowed option keys;
# presumably this means that no options are accepted (the usage is
# +reverse_seq()+) - confirm against options_allowed.
def check_options
  options_allowed(@options, nil)
end
|
97
|
+
|
98
|
+
# Reverse the sequence of a record.
#
# Builds a sequence entry from the record, reverses it in place, updates the
# sequence and residue counters and merges the reversed entry back into the
# record.
#
# @param record [Hash] BioDSL record (modified in place).
def reverse(record)
  seq = BioDSL::Seq.new_bp(record).tap(&:reverse!)

  %i(sequences_in sequences_out).each { |key| @status[key] += 1 }
  %i(residues_in residues_out).each { |key| @status[key] += seq.length }

  record.merge!(seq.to_bp)
end
|
112
|
+
end
|
113
|
+
end
|