BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
data/lib/BioDSL/csv.rb
ADDED
@@ -0,0 +1,307 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (www.github.com/maasha/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
# Monkey patching Array to add convert_types method.
|
29
|
+
class Array
  # Convert the elements of this array in place, element i being replaced by
  # the result of sending types[i] to it.
  #
  # Example: ["fish", "0.0", "1"].convert_types([:to_s, :to_f, :to_i])
  #
  # types - Array of method names (Symbols), one per element. A nil entry
  #         leaves the corresponding element untouched.
  #
  # Returns self (the array is mutated).
  # Raises ArgumentError if the array and types differ in size.
  def convert_types(types)
    if size != types.size
      fail ArgumentError, "Array and types size mismatch: #{size} != " \
                          "#{types.size}"
    end

    types.each_with_index do |type, i|
      # A nil type means "no conversion known"; send(nil) would raise
      # TypeError, so such elements are kept as they are.
      self[i] = self[i].send(type) if type
    end

    self
  end
end
|
45
|
+
|
46
|
+
module BioDSL
|
47
|
+
# Error class for all exceptions to do with CSV.
class CSVError < StandardError
end
|
48
|
+
|
49
|
+
# rubocop: disable ClassLength
|
50
|
+
|
51
|
+
# Class for manipulating CSV or table files.
|
52
|
+
# Allow reading and writing of gzip and bzip2 data.
|
53
|
+
# Auto-convert data types.
|
54
|
+
# Returns lines, arrays or hashes.
|
55
|
+
class CSV
|
56
|
+
# Open a CSV source. All arguments are forwarded to IO.open.
#
# With a block, a new CSV object wrapping the opened IO is yielded, the IO
# is closed when the block finishes (also when it raises) and the value of
# the block is returned. Without a block the new CSV object is returned and
# closing the underlying IO is left to the caller.
def self.open(*args)
  io = IO.open(*args)

  return new(io) unless block_given?

  begin
    yield new(io)
  ensure
    # Previously the IO was left open after the block; close it here.
    io.close
  end
end
|
65
|
+
|
66
|
+
# Method that reads all CSV data from a file into an array of arrays (array
|
67
|
+
# of rows) which is returned. In the default mode all columns are read.
|
68
|
+
# Using the select option subselects the columns based on a given Array or
|
69
|
+
# if a header line is present a given Hash. Vice versa for the reject option.
|
70
|
+
# Header lines are prefixed with '#' and are returned if the include_header
|
71
|
+
# option is given.
|
72
|
+
#
|
73
|
+
# Options:
|
74
|
+
# * include_header
|
75
|
+
# * delimiter.
|
76
|
+
# * select.
|
77
|
+
# * reject.
|
78
|
+
# Read every row of file via each_array and return them as an Array of
# Arrays. The options Hash is handed to each_array unchanged.
def self.read_array(file, options = {})
  [].tap do |rows|
    open(file) do |csv|
      csv.each_array(options) { |row| rows << row }
    end
  end
end
|
87
|
+
|
88
|
+
# Method that reads all CSV data from a file into an array of hashes (array
|
89
|
+
# of rows) which is returned. In the default mode all columns are read.
|
90
|
+
# Using the select option subselects the columns based on a given Array or
|
91
|
+
# if a header line is present a given Hash. Vice versa for the reject option.
|
92
|
+
# Header lines are prefixed with '#'.
|
93
|
+
#
|
94
|
+
# Options:
|
95
|
+
# * delimiter.
|
96
|
+
# * select.
|
97
|
+
# * reject.
|
98
|
+
# Read every row of file via each_hash and return them as an Array of
# Hashes. The options Hash is handed to each_hash unchanged.
def self.read_hash(file, options = {})
  [].tap do |rows|
    open(file) do |csv|
      csv.each_hash(options) { |row| rows << row }
    end
  end
end
|
107
|
+
|
108
|
+
# Constructor method for CSV.
|
109
|
+
# Wrap the given io. Header, selected field indexes and column types start
# out unset (nil); the default delimiter is a single space.
def initialize(io)
  @header = @fields = @types = nil
  @delimiter = "\s"
  @io = io
end
|
116
|
+
|
117
|
+
# Method to skip a given number of non-empty lines.
|
118
|
+
# Consume lines from @io until num non-empty lines have been read or the
# input is exhausted. Empty lines are consumed but not counted.
# Returns nil.
def skip(num)
  remaining = num

  until remaining.zero?
    line = @io.gets
    break unless line

    remaining -= 1 unless line.chomp.empty?
  end
end
|
125
|
+
|
126
|
+
# Method to iterate over a CSV IO object yielding arrays or an enumerator
|
127
|
+
# CSV.each_array(options={}) { |item| block } -> ary
|
128
|
+
# CSV.each_array(options={}) -> Enumerator
|
129
|
+
#
|
130
|
+
# Options:
|
131
|
+
# * :include_header -
|
132
|
+
# * :delimiter -
|
133
|
+
# * :select -
|
134
|
+
# * :reject -
|
135
|
+
# Iterate over the rows of @io, yielding each data row as an Array whose
# elements have been converted with the column types determined from the
# first data row. Returns self, or an Enumerator when no block is given.
#
# Lines starting with '#' are treated as header lines and are only yielded
# (as an Array of Strings) when options[:include_header] is set. Empty
# lines, and lines that split into no fields at all, are skipped.
#
# Options:
#   * :include_header - also yield header lines.
#   * :delimiter      - field separator passed to String#split (defaults to
#                       the delimiter set in the constructor).
#   * :select         - columns to keep (see get_fields).
#   * :reject         - columns to drop (see get_fields).
def each_array(options = {})
  # Forward the options so the enumerator honours them as well.
  return to_enum(:each_array, options) unless block_given?

  delimiter = options[:delimiter] || @delimiter

  @io.each do |line|
    line.chomp!
    next if line.empty?

    fields = line.split(delimiter)
    # A line made up of delimiters only yields no fields; there is neither
    # a header nor a row to extract from it.
    next if fields.empty?

    # Header and field selection are resolved once, from the first
    # non-empty line, whether that is a header or a data line.
    get_header(fields, options) unless @header
    get_fields(fields, options) unless @fields

    if line[0] == '#'
      yield @header.map(&:to_s) if options[:include_header]
      next
    end

    fields = fields.values_at(*@fields) if @fields

    determine_types(fields) unless @types

    yield fields.convert_types(@types)
  end

  self
end
|
165
|
+
|
166
|
+
# Method to iterate over a CSV IO object yielding hashes or an enumerator
|
167
|
+
# CSV.each_hash(options={}) { |item| block } -> hash
|
168
|
+
# CSV.each_hash(options={}) -> Enumerator
|
169
|
+
#
|
170
|
+
# Options:
|
171
|
+
# * :delimiter -
|
172
|
+
# * :select -
|
173
|
+
# * :reject -
|
174
|
+
# Iterate over the rows of @io, yielding each data row as a Hash keyed by
# the entries of @header. Returns self, or an Enumerator when no block is
# given.
#
# Options:
#   * :delimiter - field separator.
#   * :select    - columns to keep.
#   * :reject    - columns to drop.
#
# :include_header is not meaningful here (the header names are the Hash
# keys) and is therefore not forwarded to each_array.
def each_hash(options = {})
  return to_enum(:each_hash, options) unless block_given?

  # Forwarding :include_header would make each_array yield the header row,
  # which is not a data row and cannot be turned into a record.
  row_options = options.reject { |key, _| key == :include_header }

  each_array(row_options) do |array|
    hash = {}

    # each_array has already applied the type conversion to the row.
    array.each_with_index do |field, i|
      hash[@header[i]] = field
    end

    yield hash
  end

  self
end
|
187
|
+
|
188
|
+
private
|
189
|
+
|
190
|
+
# Method to set the @header given a list of fields (a row).
|
191
|
+
# Options:
|
192
|
+
# * :select - list of column indexes, names or a range to select.
|
193
|
+
# * :reject - list of column indexes, names or a range to reject.
|
194
|
+
# Set @header from a list of fields (the first non-empty row) and validate
# the :select / :reject options against it.
#
# If the first field starts with '#', that character is stripped (the
# fields array is modified) and the fields become the header as Symbols.
# Otherwise the header is generated as :V0, :V1, ... one per field.
#
# Options:
#   * :select - list of column indexes, names or a range to select.
#   * :reject - list of column indexes, names or a range to reject.
#
# Returns the full (unfiltered) @header.
# Raises CSVError if a selected/rejected index is out of bounds or a
# selected/rejected name is not in the header.
def get_header(fields, options)
  if fields[0][0] == '#'
    fields[0] = fields[0][1..-1]
    @header = fields.map(&:to_sym)
  else
    @header = Array.new(fields.size) { |i| "V#{i}".to_sym }
  end

  if (select = options[:select])
    # Integer instead of Fixnum: Fixnum no longer exists in current Ruby.
    if select.first.is_a? Integer
      if select.max >= @header.size
        fail CSVError, 'Selected columns out of bounds: ' \
                       "#{select.select { |c| c >= @header.size }}"
      end
    else
      select.each do |value|
        unless @header.include? value.to_sym
          fail CSVError, "Selected value: #{value} not in header: " \
                         " #{@header}"
        end
      end
    end
  elsif (reject = options[:reject])
    if reject.first.is_a? Integer
      if reject.max >= @header.size
        # Report the offending (out of bounds) columns, not the valid ones.
        fail CSVError, 'Rejected columns out of bounds: ' \
                       "#{reject.select { |c| c >= @header.size }}"
      end
    else
      reject.each do |value|
        unless @header.include? value.to_sym
          fail CSVError, "Rejected value: #{value} not found in header: " \
                         "#{@header}"
        end
      end
    end
  end

  @header
end
|
235
|
+
|
236
|
+
# Method to determine the indexes of fields to be parsed and store these in
|
237
|
+
# @fields.
|
238
|
+
# Options:
|
239
|
+
# * :select - list of column indexes, names or a range to select.
|
240
|
+
# * :reject - list of column indexes, names or a range to reject.
|
241
|
+
# Determine the indexes of the fields to be parsed, store them in @fields
# and reduce @header to the same columns. Without :select or :reject
# nothing is changed (@fields stays unset, meaning "all columns").
#
# fields - the fields of the current row; only their count is used (to
#          expand an index based :reject).
#
# Options:
#   * :select - list of column indexes, names or a range to select.
#   * :reject - list of column indexes, names or a range to reject.
#
# Raises CSVError if names are given but no header has been set.
def get_fields(fields, options)
  if (select = options[:select])
    # Integer instead of Fixnum: Fixnum no longer exists in current Ruby.
    if select.first.is_a? Integer
      @fields = select
    else
      fail CSVError, 'No header found' unless @header

      @fields = select.map { |value| @header.index(value.to_sym) }
    end

    @header = @header.values_at(*@fields)
  elsif (reject = options[:reject])
    if reject.first.is_a? Integer
      # to_a expands a Range and is a no-op for an Array.
      @fields = (0...fields.size).to_a - reject.to_a
    else
      fail CSVError, 'No header found' unless @header

      rejected = reject.map(&:to_sym)

      # Work on indexes directly so repeated header names are not collapsed.
      @fields = @header.each_index.reject do |i|
        rejected.include?(@header[i])
      end
    end

    @header = @header.values_at(*@fields)
  end
end
|
278
|
+
|
279
|
+
# Method that determines the data types used in an array of fields.
|
280
|
+
# Determine the conversion to use for each column from an array of fields
# and store the result in @types.
#
# Each field is passed through to_num and mapped by the class of the
# result: Integer -> :to_i, Float -> :to_f, String -> :to_s, anything
# else -> nil.
#
# Returns the new @types Array.
def determine_types(fields)
  @types = fields.map do |field|
    # Integer instead of Fixnum: Fixnum no longer exists in current Ruby,
    # and Integer also covers values beyond the old Fixnum range.
    case field.to_num
    when Integer then :to_i
    when Float   then :to_f
    when String  then :to_s
    end
  end
end
|
299
|
+
|
300
|
+
# IO class for CSV.
|
301
|
+
# IO class for CSV: a Filesys subclass whose gets reads the next line from
# the wrapped @io.
class IO < Filesys
  # Return the next line from the wrapped io (nil at end of input).
  def gets
    @io.gets
  end
end
|
306
|
+
end
|
307
|
+
end
|
data/lib/BioDSL/debug.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
# Namespace for BioDSL.
|
29
|
+
module BioDSL
  # Debug flag stored as a class variable on the BioDSL module; off by
  # default.
  @@debug = false

  # Return the current value of the debug flag.
  def self.debug
    class_variable_get(:@@debug)
  end

  # Set the debug flag to x.
  def self.debug=(x)
    class_variable_set(:@@debug, x)
  end
end
|
data/lib/BioDSL/fasta.rb
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
|
18
|
+
# #
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
20
|
+
# #
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
22
|
+
# #
|
23
|
+
# This software is part of BioDSL (www.BioDSL.org). #
|
24
|
+
# #
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
|
+
|
27
|
+
module BioDSL
  # Error class for all exceptions to do with FASTA.
  class FastaError < StandardError; end

  # Class for parsing FASTA entries from an ios and returning them as Seq
  # objects.
  class Fasta
    # Open a FASTA source. All arguments are forwarded to IO.open.
    #
    # With a block, a new Fasta object is yielded and the underlying ios is
    # closed when the block finishes (also when it raises). Without a block
    # the new Fasta object is returned.
    def self.open(*args)
      ios = IO.open(*args)

      return new(ios) unless block_given?

      begin
        yield new(ios)
      ensure
        ios.close
      end
    end

    # Read all entries from a FASTA source (arguments as for Fasta.open) and
    # return them as an Array.
    def self.read(*args)
      entries = []

      Fasta.open(*args) do |ios|
        ios.each { |entry| entries << entry }
      end

      entries
    end

    attr_accessor :seq_name, :seq

    # Wrap the given io and reset the parser state.
    def initialize(io)
      @io        = io
      @seq_name  = nil   # Name of the entry currently being collected.
      @seq       = ""    # Sequence lines collected so far.
      @got_first = nil   # Set once the first header line has been seen.
      @got_last  = nil   # Set once the final entry has been returned.
    end

    # Yield each entry in turn until next_entry returns nil. Returns an
    # Enumerator when called without a block.
    def each
      return to_enum(:each) unless block_given?

      while (entry = next_entry)
        yield entry
      end
    end

    # Write to the wrapped io; arguments are passed straight to its puts.
    def puts(*args)
      @io.puts(*args)
    end

    # Method to get the next FASTA entry from an ios and return this
    # as a Seq object. If no entry is found or eof then nil is returned.
    #
    # Raises FastaError if sequence content precedes the first header, if a
    # header line has nothing after '>', or if the input holds content but
    # no header at all.
    def next_entry
      @io.each do |line|
        line.chomp!

        next if line.empty?

        if line[0] == '>'
          if !@got_first && !@seq.empty?
            raise FastaError, "Bad FASTA format -> content before Fasta header: #{@seq}"
          end

          @got_first = true

          if @seq_name
            # A new header terminates the entry collected so far: hand that
            # one back and start collecting the next.
            entry     = Seq.new(seq_name: @seq_name, seq: @seq)
            @seq_name = line[1..-1]
            @seq      = ""

            raise FastaError, "Bad FASTA format -> truncated Fasta header: no content after '>'" if @seq_name.empty?

            return entry
          else
            @seq_name = line[1..-1]

            raise FastaError, "Bad FASTA format -> truncated Fasta header: no content after '>'" if @seq_name.empty?
          end
        else
          @seq << line
        end
      end

      # Input exhausted: flush the entry still being collected, if any.
      if @seq_name
        @got_last = true
        entry     = Seq.new(seq_name: @seq_name, seq: @seq)
        @seq_name = nil
        return entry
      end

      if !@got_last && !@seq.empty?
        raise FastaError, "Bad FASTA format -> content witout Fasta header: #{@seq}"
      end

      nil
    end

    # IO class for Fasta. Lines are read one gets at a time with an eof
    # check before each, so iteration can be abandoned (next_entry returns
    # from inside the block) and picked up again at the following line.
    class IO < Filesys
      def each
        return to_enum(:each) unless block_given?

        yield @io.gets until @io.eof?
      end
    end
  end
end
|
data/lib/BioDSL/fastq.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
|
18
|
+
# #
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
20
|
+
# #
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
22
|
+
# #
|
23
|
+
# This software is part of BioDSL (www.BioDSL.org). #
|
24
|
+
# #
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
|
+
|
27
|
+
module BioDSL
  # Error class for all exceptions to do with FASTQ.
  class FastqError < StandardError; end

  # Class for parsing FASTQ entries from an ios and return as Seq objects.
  class Fastq < BioDSL::Filesys
    # Open a FASTQ source. All arguments are forwarded to IO.open.
    #
    # With a block, a new Fastq object is yielded and the underlying ios is
    # closed when the block finishes (also when it raises). Without a block
    # the new Fastq object is returned.
    def self.open(*args)
      ios = IO.open(*args)

      return new(ios) unless block_given?

      begin
        yield new(ios)
      ensure
        ios.close
      end
    end

    # Wrap the given io.
    def initialize(io)
      @io = io
    end

    # Yield each entry in turn until next_entry returns nil. Returns an
    # Enumerator when called without a block.
    def each
      return to_enum(:each) unless block_given?

      while (entry = next_entry)
        yield entry
      end
    end

    # Method to get the next FASTQ entry from an ios and return this
    # as a Seq object. If no entry is found or eof then nil is returned.
    #
    # An entry is read as four consecutive lines: header, sequence, a
    # separator line (ignored) and quality scores. The first character of
    # the header line is dropped to form the sequence name.
    #
    # Raises FastqError if the input ends in the middle of an entry.
    def next_entry
      return nil if @io.eof?

      header, seq, _separator, qual = Array.new(4) { @io.gets }

      # gets returns nil at end of input, so a missing last line means the
      # entry is incomplete.
      raise FastqError, 'Bad FASTQ format -> truncated entry' if qual.nil?

      Seq.new(seq_name: header.chomp[1..-1], seq: seq.chomp, qual: qual.chomp)
    end

    # IO class for Fastq, reading one line per gets with an eof check
    # before each.
    class IO < Filesys
      def each
        return to_enum(:each) unless block_given?

        yield @io.gets until @io.eof?
      end
    end
  end
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
|
18
|
+
# #
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
20
|
+
# #
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
22
|
+
# #
|
23
|
+
# This software is part of BioDSL (www.BioDSL.org). #
|
24
|
+
# #
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
|
+
|
27
|
+
module BioDSL
  # Error class for all exceptions to do with Filesys.
  class FilesysError < StandardError; end

  # Wrapper around an IO-like object that adds transparent reading and
  # writing of gzip and bzip2 compressed files through external commands.
  class Filesys
    require 'open3'
    require 'tmpdir'

    include Enumerable

    # Cross-platform way of finding an executable in the $PATH.
    #
    #   which('ruby') #=> /usr/bin/ruby
    #
    # Returns the path of the first match or nil if there is none.
    def self.which(cmd)
      exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']

      ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).each do |path|
        exts.each do |ext|
          exe = File.join(path, "#{cmd}#{ext}")
          return exe if File.executable?(exe) && !File.directory?(exe)
        end
      end

      nil
    end

    # Class method that returns a path to a temporary file inside tmp_dir.
    # If no directory is specified reverts to the systems tmp directory.
    #
    # The file name is built from the user name, the current time in
    # seconds and the process id, so it is only distinct per process and
    # second. No file is created.
    def self.tmpfile(tmp_dir = ENV["TMPDIR"])
      tmp_dir ||= Dir.tmpdir
      time      = Time.now.to_i
      user      = ENV["USER"]
      pid       = $$

      # File.join so that a directory given without a trailing separator
      # still yields a path inside that directory.
      File.join(tmp_dir, [user, time + pid, pid].join("_") + ".tmp")
    end

    # Open file and wrap the resulting io in a new instance.
    #
    # args - file, mode and an optional Hash of options.
    #
    # In mode 'w' the :compress option (:gzip, :bzip or :bzip2) pipes the
    # output through the corresponding external command. In any other mode
    # the file type is probed with the external `file` command and gzip or
    # bzip2 data is decompressed on the fly. All remaining options are
    # passed to File.open.
    #
    # With a block the instance is yielded and the io closed afterwards
    # (also when the block raises); without one the instance is returned.
    def self.open(*args)
      file, mode, options = args
      options ||= {}

      ios = mode == 'w' ? open_writer(file, mode, options) : open_reader(file, mode, options)

      return new(ios) unless block_given?

      begin
        yield new(ios)
      ensure
        ios.close
      end
    end

    # Return an io for writing to file, optionally through a compressor.
    def self.open_writer(file, mode, options)
      case options[:compress]
      when :gzip
        ios, = Open3.pipeline_w("gzip -f", out: file)
        ios
      when :bzip, :bzip2
        ios, = Open3.pipeline_w("bzip2 -c", out: file)
        ios
      else
        File.open(file, mode, **file_options(options))
      end
    end

    # Return an io for reading file, decompressing gzip/bzip2 data.
    def self.open_reader(file, mode, options)
      path = (file.respond_to?(:path) ? file.path : file).to_s

      # Commands are given as argument arrays so that no shell is involved
      # and paths with spaces or shell metacharacters are passed verbatim.
      # -b keeps the file name out of the output, so a path that merely
      # contains "gzip" or "bzip" is not mistaken for compressed data.
      type = ::IO.popen(['file', '-Lkb', path], &:read)

      case type
      when /gzip/
        ::IO.popen(['gzip', '-cd', path])
      when /bzip/
        ::IO.popen(['bzcat', path])
      else
        File.open(file, mode, **file_options(options))
      end
    end

    # Options to hand to File.open: everything except :compress, which is
    # consumed by open itself. They are passed as keywords because a
    # positional Hash is not accepted as options by current Ruby versions.
    def self.file_options(options)
      options.reject { |key, _| key == :compress }
    end

    private_class_method :open_writer, :open_reader, :file_options

    attr_reader :io

    # Wrap the given io object.
    def initialize(ios)
      @io = ios
    end

    # Read the next line from the wrapped io.
    def gets
      @io.gets
    end

    # Write to the wrapped io using its puts.
    def puts(*args)
      @io.puts(*args)
    end

    # Read from the wrapped io using its read.
    def read
      @io.read
    end

    # Write arg to the wrapped io.
    def write(arg)
      @io.write arg
    end

    # Close the wrapped io.
    def close
      @io.close
    end

    # Ask the wrapped io whether it is at end of file.
    def eof?
      @io.eof?
    end

    # Iterator method for parsing entries. Yields what the wrapped io's each
    # yields; returns an Enumerator when called without a block.
    def each
      return to_enum :each unless block_given?

      @io.each { |line| yield line }
    end
  end
end
|
137
|
+
|