bio-table 0.9.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +46 -2
- data/VERSION +1 -1
- data/bin/bio-table +246 -215
- data/lib/bio-table.rb +1 -0
- data/lib/bio-table/count.rb +39 -0
- data/lib/bio-table/filter.rb +1 -0
- data/lib/bio-table/merge.rb +3 -1
- data/lib/bio-table/parser.rb +6 -4
- data/lib/bio-table/rewrite.rb +9 -2
- data/lib/bio-table/table_apply.rb +4 -0
- data/lib/bio-table/tableload.rb +2 -1
- data/lib/bio-table/validator.rb +4 -4
- data/test/data/regression/table1-STDIN.ref +1 -0
- metadata +3 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b33f9729f357116b8a58a31cdefdba144c5bfb5d
|
4
|
+
data.tar.gz: 04ed672ff432dcbe3b611a1441e83a84db5ebecc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca15864e31d7c9dcfae49edbf281e27e4afae738a507787bc5b1860ee42ead5e2e4a418387fdbcb9c68ef674d86417a0760151db51b89af33ee1de5617275cee
|
7
|
+
data.tar.gz: 3cf3e0540bf9f3beea349d3e4db8d3792b6b394070171b451ea8105a9c9b89ba09a72e802060dc61aaae92f110707f0fbce5ed7662d996ef95257fb88c0e6386
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -38,6 +38,7 @@ Features:
|
|
38
38
|
* Calculate new values
|
39
39
|
* Calculate column statistics (mean, standard deviation)
|
40
40
|
* Diff between tables, selecting on specific column values
|
41
|
+
* Count elements in columns
|
41
42
|
* Merge tables side by side on column value/rowname
|
42
43
|
* Split/reduce tables by column
|
43
44
|
* Write formatted tables, e.g. HTML, LaTeX
|
@@ -174,6 +175,13 @@ in the table, use unshift headers, 0 becomes an 'ID' column
|
|
174
175
|
bio-table table1.csv --unshift-headers --columns 0,1,8,2,4,6
|
175
176
|
```
|
176
177
|
|
178
|
+
Another option will add fields to a row to get the same number of
|
179
|
+
fields
|
180
|
+
|
181
|
+
```sh
|
182
|
+
bio-table table1.csv --pad-fields
|
183
|
+
```
|
184
|
+
|
177
185
|
Duplicate columns with
|
178
186
|
|
179
187
|
```sh
|
@@ -186,6 +194,13 @@ Combine column values (more on rewrite below)
|
|
186
194
|
bio-table table1.csv --rewrite "rowname = rowname + '-' + field[0]"
|
187
195
|
```
|
188
196
|
|
197
|
+
To insert a table column simply add a tab, e.g., to inject a
|
198
|
+
column containing 'PATHWAY'
|
199
|
+
|
200
|
+
```sh
|
201
|
+
bio-table table1.csv --rewrite 'field[0] = "PATHWAY\t"+field[0]'
|
202
|
+
```
|
203
|
+
|
189
204
|
To filter for columns using a regular expression
|
190
205
|
|
191
206
|
```sh
|
@@ -220,6 +235,29 @@ Another option is to use (lazy) values:
|
|
220
235
|
|
221
236
|
which saves you from typing to_f.
|
222
237
|
|
238
|
+
Another feature is counting column elements. With
|
239
|
+
|
240
|
+
```sh
|
241
|
+
bio-table table1.csv --count 0,1,4
|
242
|
+
```
|
243
|
+
|
244
|
+
All records are combined that have the same rowname and values in
|
245
|
+
columns 0 and 3. In addition a column is added counting the number of
|
246
|
+
merged rows. So,
|
247
|
+
|
248
|
+
```
|
249
|
+
hs8 48713371 53713371 G SAMPLE005
|
250
|
+
hs8 48713371 53713371 G SAMPLE005
|
251
|
+
hs9 136643994 141643994 C SAMPLE005
|
252
|
+
```
|
253
|
+
|
254
|
+
becomes
|
255
|
+
|
256
|
+
```
|
257
|
+
hs8 48713371 53713371 G SAMPLE005 2
|
258
|
+
hs9 136643994 141643994 C SAMPLE005 1
|
259
|
+
```
|
260
|
+
|
223
261
|
### Statistics
|
224
262
|
|
225
263
|
bio-table can handle some column statistics using the Ruby statsample
|
@@ -287,9 +325,11 @@ with NA's, unless you add a filter, e.g.
|
|
287
325
|
|
288
326
|
```sh
|
289
327
|
bio-table --merge table1.csv table2.csv --num-filter "values.compact.size == values.size"
|
290
|
-
|
291
328
|
```
|
292
329
|
|
330
|
+
If you don't want the headers to be 'restyled' on merge, use the --keep-headers
|
331
|
+
override.
|
332
|
+
|
293
333
|
### Splitting a table
|
294
334
|
|
295
335
|
Splitting a table by column is possible by named or indexed columns,
|
@@ -554,7 +594,11 @@ Coming soon
|
|
554
594
|
The API doc is online. For more code examples see the test files in
|
555
595
|
the source tree.
|
556
596
|
|
557
|
-
|
597
|
+
## Troubleshooting
|
598
|
+
|
599
|
+
Run bio-table with the --debug switch to get stack traces. Use --debug
|
600
|
+
and/or --trace for more output.
|
601
|
+
|
558
602
|
## Project home page
|
559
603
|
|
560
604
|
Information on the source tree, documentation, examples, issues and
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
1.0.0
|
data/bin/bio-table
CHANGED
@@ -2,280 +2,311 @@
|
|
2
2
|
#
|
3
3
|
# BioRuby bio-table Plugin BioTable
|
4
4
|
# Author:: Pjotr Prins
|
5
|
-
# Copyright:: 2012
|
6
5
|
|
7
|
-
|
8
|
-
$: << File.join(rootpath,'lib')
|
6
|
+
begin
|
9
7
|
|
10
|
-
|
8
|
+
rootpath = File.dirname(File.dirname(__FILE__))
|
9
|
+
$: << File.join(rootpath,'lib')
|
11
10
|
|
12
|
-
|
11
|
+
_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp
|
13
12
|
|
14
|
-
|
13
|
+
INPUT_ON_STDIN = !$stdin.tty?
|
15
14
|
|
16
|
-
|
15
|
+
$stderr.print "bio-table "+_VERSION+" Copyright (C) 2012-2014 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"
|
17
16
|
|
18
|
-
|
17
|
+
USAGE =<<EOU
|
19
18
|
|
20
|
-
|
19
|
+
bio-table transforms, filters and reorders table files (CSV, tab-delimited).
|
21
20
|
|
22
|
-
|
23
|
-
print USAGE
|
24
|
-
end
|
21
|
+
EOU
|
25
22
|
|
26
|
-
|
27
|
-
|
28
|
-
|
23
|
+
if ARGV.size == 0 and not INPUT_ON_STDIN
|
24
|
+
print USAGE
|
25
|
+
end
|
29
26
|
|
30
|
-
|
31
|
-
|
27
|
+
require 'bio-table'
|
28
|
+
require 'optparse'
|
29
|
+
require 'bio-logger'
|
32
30
|
|
33
|
-
Bio::Log::
|
34
|
-
Bio::Log::
|
31
|
+
log = Bio::Log::LoggerPlus.new 'bio-table'
|
32
|
+
# log.outputters = Bio::Log::Outputter.stderr
|
35
33
|
|
36
|
-
|
37
|
-
|
38
|
-
opts = OptionParser.new do |o|
|
39
|
-
o.banner = "Usage: #{File.basename($0)} [options] filename\n\n"
|
34
|
+
Bio::Log::CLI.logger('stderr')
|
35
|
+
Bio::Log::CLI.trace('info')
|
40
36
|
|
41
|
-
|
42
|
-
|
43
|
-
|
37
|
+
options = {show_help: false, write_header: true, skip: 0}
|
38
|
+
options[:show_help] = true if ARGV.size == 0 and not INPUT_ON_STDIN
|
39
|
+
opts = OptionParser.new do |o|
|
40
|
+
o.banner = "Usage: #{File.basename($0)} [options] filename\n\n"
|
44
41
|
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
o.on('--num-filter expression', 'Numeric filtering function') do |par|
|
43
|
+
options[:num_filter] = par
|
44
|
+
end
|
48
45
|
|
49
|
-
|
50
|
-
|
51
|
-
|
46
|
+
o.on('--filter expression', 'Generic filtering function') do |par|
|
47
|
+
options[:filter] = par
|
48
|
+
end
|
52
49
|
|
53
|
-
|
54
|
-
|
55
|
-
|
50
|
+
o.on('--rewrite expression', 'Rewrite function') do |par|
|
51
|
+
options[:rewrite] = par
|
52
|
+
end
|
56
53
|
|
57
|
-
|
58
|
-
|
59
|
-
|
54
|
+
o.on('--count list', Array, 'Merge and count similar rows') do |list|
|
55
|
+
options[:count] = list
|
56
|
+
end
|
60
57
|
|
61
|
-
|
62
|
-
|
63
|
-
|
58
|
+
o.on('--columns list', Array, 'List of column names or indices') do |l|
|
59
|
+
options[:columns] = l
|
60
|
+
end
|
64
61
|
|
65
|
-
|
66
|
-
|
67
|
-
ARGV.unshift l.first
|
68
|
-
l = ["0"]
|
62
|
+
o.on('--column-filter expression', 'Column name filtering function') do |par|
|
63
|
+
options[:column_filter] = par
|
69
64
|
end
|
70
|
-
options[:diff] = l
|
71
|
-
end
|
72
65
|
|
73
|
-
|
74
|
-
|
75
|
-
ARGV.unshift l.first
|
76
|
-
l = ["0"]
|
66
|
+
o.on('--merge','Merge tables by rowname') do
|
67
|
+
options[:merge] = true
|
77
68
|
end
|
78
|
-
options[:overlap] = l
|
79
|
-
end
|
80
|
-
|
81
|
-
o.on('--merge','Merge tables by rowname') do
|
82
|
-
options[:merge] = true
|
83
|
-
end
|
84
69
|
|
85
|
-
|
70
|
+
o.on('--diff list',Array,'Diff two input files on columns (default rownames)') do |l|
|
71
|
+
if l.size==1 and File.exist?(l.first)
|
72
|
+
ARGV.unshift l.first
|
73
|
+
l = ["0"]
|
74
|
+
end
|
75
|
+
options[:diff] = l
|
76
|
+
end
|
86
77
|
|
87
|
-
|
88
|
-
|
89
|
-
|
78
|
+
o.on('--overlap list',Array,'Find overlap of two input files on columns)') do |l|
|
79
|
+
if l.size==1 and File.exist?(l.first)
|
80
|
+
ARGV.unshift l.first
|
81
|
+
l = ["0"]
|
82
|
+
end
|
83
|
+
options[:overlap] = l
|
84
|
+
end
|
85
|
+
|
86
|
+
o.on('--merge','Merge tables by rowname') do
|
87
|
+
options[:merge] = true
|
88
|
+
end
|
90
89
|
|
91
|
-
|
92
|
-
options[:with_headers] = true
|
93
|
-
options[:write_header] = false
|
94
|
-
end
|
90
|
+
o.separator "\n\tOverrides:\n\n"
|
95
91
|
|
96
|
-
|
97
|
-
|
98
|
-
|
92
|
+
o.on('--skip lines',Integer,'Skip the first lines before parsing') do |skip|
|
93
|
+
options[:skip] = skip
|
94
|
+
end
|
99
95
|
|
100
|
-
|
101
|
-
|
102
|
-
|
96
|
+
o.on('--with-headers','Include the header element in filtering etc.') do
|
97
|
+
options[:with_headers] = true
|
98
|
+
options[:write_header] = false
|
99
|
+
end
|
103
100
|
|
104
|
-
|
105
|
-
|
106
|
-
|
101
|
+
o.on('--with-rownames','Include the rownames in filtering etc.') do
|
102
|
+
options[:with_rownames] = true
|
103
|
+
end
|
107
104
|
|
108
|
-
|
105
|
+
o.on('--unshift-headers','Add an extra header element at the front (header contains one fewer field than the number of columns)') do
|
106
|
+
options[:unshift_headers] = true
|
107
|
+
end
|
109
108
|
|
110
|
-
|
111
|
-
|
112
|
-
|
109
|
+
o.on('--keep-headers','Keep original headers on merge') do
|
110
|
+
options[:keep_headers] = true
|
111
|
+
end
|
113
112
|
|
114
|
-
o.separator "\n\tFormat and options:\n\n"
|
115
|
-
|
116
|
-
o.on('--in-format [tab,csv,split,regex]', [:tab, :csv, :split, :regex], 'Input format (default tab)') do |par|
|
117
|
-
options[:in_format] = par.to_sym
|
118
|
-
end
|
119
113
|
|
120
|
-
|
121
|
-
|
122
|
-
|
114
|
+
o.on('--strip-quotes','Strip quotes from table fields') do
|
115
|
+
options[:strip_quotes] = true
|
116
|
+
end
|
123
117
|
|
124
|
-
|
125
|
-
|
126
|
-
|
118
|
+
o.on('--pad-fields','Add empty fields if a row is too short') do
|
119
|
+
options[:pad_fields] = true
|
120
|
+
end
|
127
121
|
|
128
|
-
|
129
|
-
options[:evaluate] = s
|
130
|
-
end
|
122
|
+
o.separator "\n\tTransform:\n\n"
|
131
123
|
|
132
|
-
|
133
|
-
|
134
|
-
|
124
|
+
o.on('--transform-ids [downcase,upcase]',[:downcase,:upcase],'Transform column and row identifiers') do |par|
|
125
|
+
options[:transform_ids] = par.to_sym
|
126
|
+
end
|
135
127
|
|
136
|
-
|
137
|
-
|
138
|
-
|
128
|
+
o.separator "\n\tFormat and options:\n\n"
|
129
|
+
|
130
|
+
o.on('--in-format [tab,csv,split,regex]', [:tab, :csv, :split, :regex], 'Input format (default tab)') do |par|
|
131
|
+
options[:in_format] = par.to_sym
|
132
|
+
end
|
139
133
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
o.separator "\n\tVerbosity:\n\n"
|
145
|
-
|
146
|
-
o.on("--logger filename",String,"Log to file (default stderr)") do | name |
|
147
|
-
Bio::Log::CLI.logger(name)
|
148
|
-
end
|
134
|
+
o.on('--format [tab,csv,rdf,eval]', [:tab, :csv, :rdf, :eval], 'Output format (default tab)') do |par|
|
135
|
+
options[:format] = par.to_sym
|
136
|
+
end
|
149
137
|
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
o.on("-q", "--quiet", "Run quietly") do |q|
|
155
|
-
Bio::Log::CLI.trace('error')
|
156
|
-
end
|
157
|
-
|
158
|
-
o.on("-v", "--verbose", "Run verbosely") do |v|
|
159
|
-
Bio::Log::CLI.trace('info')
|
160
|
-
end
|
161
|
-
|
162
|
-
o.on("--debug", "Show debug messages") do |v|
|
163
|
-
Bio::Log::CLI.trace('debug')
|
164
|
-
end
|
138
|
+
o.on("--split-on command",String,"Split on string or regex (use with --in-format)") do | s |
|
139
|
+
options[:split_on] = s
|
140
|
+
end
|
165
141
|
|
166
|
-
|
142
|
+
o.on("-e command",String,"Evaluate output command (use with --format eval)") do | s |
|
143
|
+
options[:evaluate] = s
|
144
|
+
end
|
167
145
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
end
|
146
|
+
o.on("--fasta regex",String,"Read FASTA format creating ID with regex") do | regex |
|
147
|
+
options[:fasta] = regex
|
148
|
+
end
|
172
149
|
|
173
|
-
|
174
|
-
|
150
|
+
o.on('--blank-nodes','Output (RDF) blank nodes - allowing for duplicate row names') do
|
151
|
+
options[:blank_nodes] = true
|
152
|
+
end
|
175
153
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
154
|
+
o.on('--statistics','Output column statistics') do
|
155
|
+
options[:statistics] = true
|
156
|
+
end
|
157
|
+
|
158
|
+
o.separator "\n\tVerbosity:\n\n"
|
159
|
+
|
160
|
+
o.on("--logger filename",String,"Log to file (default stderr)") do | name |
|
161
|
+
Bio::Log::CLI.logger(name)
|
162
|
+
end
|
180
163
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
164
|
+
o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
|
165
|
+
Bio::Log::CLI.trace(s)
|
166
|
+
end
|
167
|
+
|
168
|
+
o.on("-q", "--quiet", "Run quietly") do |q|
|
169
|
+
Bio::Log::CLI.trace('error')
|
170
|
+
end
|
171
|
+
|
172
|
+
o.on("-v", "--verbose", "Run verbosely") do |v|
|
173
|
+
Bio::Log::CLI.trace('info')
|
174
|
+
end
|
175
|
+
|
176
|
+
o.on("--debug", "Show debug messages") do |v|
|
177
|
+
Bio::Log::CLI.trace('debug')
|
178
|
+
options[:debug] = true
|
179
|
+
end
|
186
180
|
|
187
|
-
|
188
|
-
logger = Bio::Log::LoggerPlus['bio-table']
|
189
|
-
logger.info [options]
|
181
|
+
o.separator ""
|
190
182
|
|
191
|
-
|
183
|
+
o.on_tail('-h', '--help', 'Display this help and exit') do
|
184
|
+
options[:show_help] = true
|
185
|
+
end
|
186
|
+
end
|
192
187
|
|
193
|
-
|
194
|
-
|
195
|
-
logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
|
196
|
-
t1 = TableReader::read_file(ARGV[0], options)
|
197
|
-
t2 = TableReader::read_file(ARGV[1], options)
|
198
|
-
t = Diff::diff_tables(t1,t2, options)
|
199
|
-
t.write(options)
|
200
|
-
exit
|
201
|
-
end
|
188
|
+
begin
|
189
|
+
opts.parse!(ARGV)
|
202
190
|
|
203
|
-
if options[:
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
t2 = TableReader::read_file(ARGV[1], options)
|
208
|
-
t = Overlap::overlap_tables(t1,t2, options)
|
209
|
-
t.write(options)
|
210
|
-
exit
|
211
|
-
end
|
191
|
+
if options[:show_help]
|
192
|
+
print opts
|
193
|
+
print USAGE
|
194
|
+
end
|
212
195
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
196
|
+
# TODO: your code here
|
197
|
+
# use options for your logic
|
198
|
+
rescue OptionParser::InvalidOption => e
|
199
|
+
options[:invalid_argument] = e.message
|
200
|
+
end
|
201
|
+
|
202
|
+
Bio::Log::CLI.configure('bio-table')
|
203
|
+
logger = Bio::Log::LoggerPlus['bio-table']
|
204
|
+
logger.info [options]
|
205
|
+
|
206
|
+
include BioTable
|
207
|
+
|
208
|
+
if options[:diff]
|
209
|
+
logger.warn "Column settings are ignored for --diff" if options[:columns]
|
210
|
+
logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
|
211
|
+
t1 = TableReader::read_file(ARGV[0], options)
|
212
|
+
t2 = TableReader::read_file(ARGV[1], options)
|
213
|
+
t = Diff::diff_tables(t1,t2, options)
|
214
|
+
t.write(options)
|
215
|
+
exit
|
216
|
+
end
|
217
|
+
|
218
|
+
if options[:overlap]
|
219
|
+
logger.warn "Column settings are ignored for --overlap" if options[:columns]
|
220
|
+
logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
|
221
|
+
t1 = TableReader::read_file(ARGV[0], options)
|
222
|
+
t2 = TableReader::read_file(ARGV[1], options)
|
223
|
+
t = Overlap::overlap_tables(t1,t2, options)
|
224
|
+
t.write(options)
|
225
|
+
exit
|
226
|
+
end
|
227
|
+
|
228
|
+
if options[:fasta]
|
229
|
+
logger.warn "Column settings are ignored for --fasta" if options[:columns]
|
230
|
+
ARGV.each do | fn |
|
231
|
+
print "id\tseq\n"
|
232
|
+
FastaReader.new(fn,options[:fasta]).each do | rec |
|
233
|
+
print rec.id,"\t",rec.seq,"\n"
|
234
|
+
end
|
219
235
|
end
|
236
|
+
exit
|
220
237
|
end
|
221
|
-
exit
|
222
|
-
end
|
223
238
|
|
224
|
-
if options[:merge]
|
225
|
-
|
226
|
-
|
227
|
-
|
239
|
+
if options[:merge]
|
240
|
+
ts = []
|
241
|
+
ARGV.each do | fn |
|
242
|
+
ts << TableReader::read_file(fn, options)
|
243
|
+
end
|
244
|
+
t = Merge::merge_tables(ts, options)
|
245
|
+
t.write(options)
|
246
|
+
exit
|
228
247
|
end
|
229
|
-
t = Merge::merge_tables(ts, options)
|
230
|
-
t.write(options)
|
231
|
-
exit
|
232
|
-
end
|
233
248
|
|
234
|
-
#
|
235
|
-
# We also support STDIN for the first 'file'. A non-blocking idea can
|
236
|
-
# be found here:
|
237
|
-
#
|
238
|
-
# http://eric.lubow.org/2010/ruby/multiple-input-locations-from-bash-into-ruby/
|
239
|
-
#
|
249
|
+
#
|
250
|
+
# We also support STDIN for the first 'file'. A non-blocking idea can
|
251
|
+
# be found here:
|
252
|
+
#
|
253
|
+
# http://eric.lubow.org/2010/ruby/multiple-input-locations-from-bash-into-ruby/
|
254
|
+
#
|
240
255
|
|
241
|
-
writer =
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
256
|
+
writer =
|
257
|
+
if options[:format] == :rdf
|
258
|
+
BioTable::RDF::Writer.new(options[:blank_nodes])
|
259
|
+
else
|
260
|
+
BioTable::TableWriter::Writer.new(options[:format],options[:evaluate])
|
261
|
+
end
|
262
|
+
|
263
|
+
statistics = if options[:statistics]
|
264
|
+
BioTable::Statistics::Accumulate.new
|
265
|
+
end
|
266
|
+
count = if options[:count]
|
267
|
+
BioTable::Count::CountTracker.new(options[:count])
|
268
|
+
end
|
247
269
|
|
248
|
-
if INPUT_ON_STDIN
|
249
270
|
opts = options.dup # so we can 'safely' modify options
|
250
271
|
has_input = false
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
272
|
+
walk_table = lambda { |f|
|
273
|
+
BioTable::TableLoader.emit(f, opts).each do |row, type| # type is :header or :row
|
274
|
+
if statistics
|
275
|
+
statistics.add(row,type)
|
276
|
+
else
|
277
|
+
row = count.add(row,type) if count # merge and count
|
278
|
+
writer.write(TableRow.new(row[0],row[1..-1]),type) if row
|
279
|
+
end
|
280
|
+
has_input = true
|
281
|
+
end
|
282
|
+
if count
|
283
|
+
row = count.add(row,:row,flush: true)
|
284
|
+
writer.write(TableRow.new(row[0],row[1..-1]),:row) if row
|
285
|
+
end
|
286
|
+
options[:write_header] = false if has_input # don't write the header for chained files
|
287
|
+
}
|
263
288
|
|
264
|
-
|
265
|
-
|
266
|
-
f = File.open(fn,"r")
|
267
|
-
if not opts[:in_format] and fn =~ /\.csv$/
|
268
|
-
logger.debug "Autodetected CSV file"
|
269
|
-
opts[:in_format] = :csv
|
289
|
+
if INPUT_ON_STDIN
|
290
|
+
walk_table.call(STDIN)
|
270
291
|
end
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
292
|
+
|
293
|
+
ARGV.each do | fn |
|
294
|
+
opts = options.dup # so we can 'safely' modify options
|
295
|
+
f = File.open(fn,"r")
|
296
|
+
if not opts[:in_format] and fn =~ /\.csv$/i
|
297
|
+
logger.debug "Autodetected CSV file"
|
298
|
+
opts[:in_format] = :csv
|
276
299
|
end
|
300
|
+
walk_table.call(f)
|
277
301
|
end
|
278
|
-
options[:write_header] = false # don't write the header for chained files
|
279
|
-
end
|
280
302
|
|
281
|
-
statistics.write(writer) if statistics
|
303
|
+
statistics.write(writer) if statistics
|
304
|
+
|
305
|
+
rescue => msg
|
306
|
+
if options[:debug]
|
307
|
+
raise
|
308
|
+
else
|
309
|
+
$stderr.print "Error: ",msg
|
310
|
+
exit 1
|
311
|
+
end
|
312
|
+
end
|
data/lib/bio-table.rb
CHANGED
@@ -0,0 +1,39 @@
|
|
1
|
+
module BioTable
  module Count
    # Collapses runs of consecutive rows that agree on a chosen set of
    # positions and reports the size of each run. Only the last row of the
    # cumulative list is handed back, with the count attached.
    #
    # A row is only ever compared with the row added just before it, so equal
    # rows that are not adjacent in the input end up in separate groups.
    class CountTracker
      # list holds the positions to compare (integers or numeric strings).
      # Each position indexes the complete row array that is passed to #add.
      def initialize list
        @list = list.map { |item| item.to_i }
        @rows = []
      end

      # Add a row and, if it differs from the pending group, send the last
      # merged edition of that group back.
      #
      # row   - array of fields (ignored when flush is true)
      # type  - :header or :row; a header is returned at once with a "count"
      #         label appended and leaves the pending group untouched
      # flush - when true, hand back the pending group and reset the tracker
      #
      # Returns the last row of a completed group with the group size
      # appended, or nil when this call did not complete a group.
      def add row, type, flush: false
        return row+["count"] if type == :header
        prev = @rows.last
        num = @rows.size
        if flush
          # Reset so that a tracker reused for another input neither emits
          # this group a second time nor merges it into the next input.
          @rows = []
          # Nothing pending (e.g. header-only input): report nil, not an error
          return prev ? prev+[num] : nil
        end
        # The row extends the pending group when every tracked position
        # matches the previous row
        if prev && @list.all? { |i| row[i] == prev[i] }
          @rows << row
          return nil
        end
        # The row starts a new group; emit the finished one, if there was one
        @rows = [row]
        prev+[num] if prev
      end

    end
  end
end
|
data/lib/bio-table/filter.rb
CHANGED
data/lib/bio-table/merge.rb
CHANGED
@@ -6,7 +6,9 @@ module BioTable
|
|
6
6
|
logger = Bio::Log::LoggerPlus['bio-table']
|
7
7
|
logger.info("Merging tables")
|
8
8
|
headers = tables.first.header[0..0] +
|
9
|
-
tables.map { |t| t.header[1..-1].map{|n|
|
9
|
+
tables.map { |t| t.header[1..-1].map{|n|
|
10
|
+
(options[:keep_headers] ? n : t.name+'-'+n)
|
11
|
+
}}.flatten
|
10
12
|
t = Table.new(headers)
|
11
13
|
# index tables on rownames
|
12
14
|
idxs = []
|
data/lib/bio-table/parser.rb
CHANGED
@@ -7,21 +7,23 @@ module BioTable
|
|
7
7
|
# Converts a string into an array of string fields
|
8
8
|
def LineParser::parse(line, in_format, split_on)
|
9
9
|
if in_format == :csv
|
10
|
-
|
10
|
+
$stderr.print "WARNING: this looks like a tab delimited file to me!\n" if line =~ /\t/
|
11
|
+
CSV.parse_line(line)
|
11
12
|
elsif in_format == :split
|
12
|
-
line.
|
13
|
+
line.split(split_on).map { |field|
|
13
14
|
fld = field.strip
|
14
15
|
fld = nil if fld == "NA"
|
15
16
|
fld
|
16
17
|
}
|
17
18
|
elsif in_format == :regex
|
18
|
-
line.
|
19
|
+
line.split(/#{split_on}/).map { |field|
|
19
20
|
fld = field.strip
|
20
21
|
fld = nil if fld == "NA"
|
21
22
|
fld
|
22
23
|
}
|
23
24
|
else
|
24
|
-
|
25
|
+
$stderr.print "WARNING: this looks like a tab delimited file to me!\n" if line =~ /,"/
|
26
|
+
line.split("\t").map { |field|
|
25
27
|
fld = field.strip
|
26
28
|
fld = nil if fld == "NA"
|
27
29
|
fld
|
data/lib/bio-table/rewrite.rb
CHANGED
@@ -4,9 +4,10 @@ module BioTable
|
|
4
4
|
|
5
5
|
# Rewrite fields. Both field and fields can be used, but not at the same time.
|
6
6
|
def Rewrite::rewrite code, rowname, field
|
7
|
-
fields = field
|
8
|
-
original = field
|
7
|
+
fields = field.dup
|
8
|
+
original = field.dup
|
9
9
|
values = LazyValues.new(field)
|
10
|
+
value = values
|
10
11
|
return rowname,field if not code or code==""
|
11
12
|
begin
|
12
13
|
eval(code)
|
@@ -14,6 +15,12 @@ module BioTable
|
|
14
15
|
$stderr.print "Failed to evaluate ",rowname," ",field," with ",code,"\n"
|
15
16
|
raise
|
16
17
|
end
|
18
|
+
if (fields & original != fields.uniq) and (field & original != field.uniq)
|
19
|
+
$stderr.print [:original,original],"\n"
|
20
|
+
$stderr.print [:fields,fields],"\n"
|
21
|
+
$stderr.print [:field,field],"\n"
|
22
|
+
raise "You can not rewrite both field and fields!"
|
23
|
+
end
|
17
24
|
field = fields if fields != original
|
18
25
|
return rowname,field
|
19
26
|
end
|
@@ -51,9 +51,13 @@ module BioTable
|
|
51
51
|
return column_idx, new_header
|
52
52
|
end
|
53
53
|
|
54
|
+
# Take a line as a string and return it as a tuple of rowname and datafields
|
54
55
|
def parse_row(line_num, line, header, column_idx, prev_fields, options)
|
55
56
|
fields = LineParser::parse(line, options[:in_format], options[:split_on])
|
56
57
|
return nil,nil if fields.compact == []
|
58
|
+
if options[:pad_fields] and fields.size < header.size
|
59
|
+
fields += [''] * (header.size - fields.size)
|
60
|
+
end
|
57
61
|
fields = Formatter::strip_quotes(fields) if @strip_quotes
|
58
62
|
fields = Formatter::transform_row_ids(@transform_ids, fields) if @transform_ids
|
59
63
|
fields = Filter::apply_column_filter(fields,column_idx)
|
data/lib/bio-table/tableload.rb
CHANGED
@@ -9,7 +9,7 @@ module BioTable
|
|
9
9
|
#
|
10
10
|
# Note that you need to pass in :with_header to get the header row
|
11
11
|
def TableLoader::emit generator, options = {}
|
12
|
-
table_apply = TableApply.new(options)
|
12
|
+
table_apply = TableApply.new(options) # parser and filters
|
13
13
|
column_index = nil, prev_line = nil
|
14
14
|
skip = options[:skip]
|
15
15
|
skip = 0 if skip == nil
|
@@ -19,6 +19,7 @@ module BioTable
|
|
19
19
|
generator.each_with_index do |line, line_num|
|
20
20
|
# p [line_num, line]
|
21
21
|
if line_num-skip == 0
|
22
|
+
# ---- This is the header section
|
22
23
|
header = table_apply.parse_header(line, options)
|
23
24
|
# Validator::valid_header?(header, @header) # compare against older header when merging
|
24
25
|
column_index,header = table_apply.column_index(header) # we may rewrite the header
|
data/lib/bio-table/validator.rb
CHANGED
@@ -4,8 +4,8 @@ module BioTable
|
|
4
4
|
def Validator::valid_header? header, old_header
|
5
5
|
if old_header
|
6
6
|
if header - old_header != []
|
7
|
-
|
8
|
-
|
7
|
+
$stderr.print old_header,"\n"
|
8
|
+
$stderr.print header,"\n"
|
9
9
|
raise "Headers do not compare!"
|
10
10
|
end
|
11
11
|
end
|
@@ -15,8 +15,8 @@ module BioTable
|
|
15
15
|
def Validator::valid_row? line_number, fields, last_fields
|
16
16
|
return false if fields == nil or fields.size == 0
|
17
17
|
if last_fields and last_fields.size>0 and (fields.size != last_fields.size)
|
18
|
-
|
19
|
-
|
18
|
+
$stderr.print last_fields,"\n"
|
19
|
+
$stderr.print fields,"\n"
|
20
20
|
throw "Number of fields diverge in line #{line_number} (size #{fields.size}, expected #{last_fields.size})"
|
21
21
|
end
|
22
22
|
true
|
@@ -378,6 +378,7 @@
|
|
378
378
|
110173,9.97,18.59,12.35,13.67,14.56,14.63,12.69,18.49,14.23,16.23,,,20.48,16.47,20.68,13.14,18.88,14.3,13.67,20.54,15.99,16.15,21.33,17.06,,16.05,,,Manba,"mannosidase, beta A, lysosomal,mannosidase, beta A, lysosomal,mannosidase, beta A, lysosomal,"
|
379
379
|
110187,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0.31,0,0,0,,0,,,Abpg,"androgen binding protein gamma"
|
380
380
|
110196,11.21,27.98,12.13,16.57,16.77,12.76,14.31,13.07,9.22,10.51,,,29.03,16.7,7.62,24.09,15.31,28.3,10.92,14.73,23.27,15.13,36.77,10.05,,15.15,,,Fdps,"farnesyl diphosphate synthetase"
|
381
|
+
|
381
382
|
0 0.06 NA 0 0 0 0 0.11 0 0 0 NA NA 0 0 0 0 0 0 0 0 0 0 0 0 NA 0 NA NA Mal2 MAL2 proteolipid protein
|
382
383
|
213.15 236.88 213.95 213.15 253.49 198 231.56 200.96 255.2 214.04 231.46 NA NA 233.23 241.26 237.53 171.87 237.13 162.3 252.13 284.85 188.76 253.43 220.15 305.52 NA 217.42 NA NA Nckap1l NCK associated protein 1 like,NCK associated protein 1 like,
|
383
384
|
0 0 NA 0 0 0.07 0.04 0 0 0 0 NA NA 0.02 0 0 0 0 0 0.06 0 0 0 0.02 0 NA 0 NA NA Csdc2 RNA-binding protein pippin
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-table
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pjotr Prins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -80,20 +80,6 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 2.0.0
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: bio
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: 1.4.2
|
90
|
-
type: :development
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: 1.4.2
|
97
83
|
- !ruby/object:Gem::Dependency
|
98
84
|
name: rdoc
|
99
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -153,6 +139,7 @@ files:
|
|
153
139
|
- features/support/env.rb
|
154
140
|
- lib/bio-table.rb
|
155
141
|
- lib/bio-table/columns.rb
|
142
|
+
- lib/bio-table/count.rb
|
156
143
|
- lib/bio-table/diff.rb
|
157
144
|
- lib/bio-table/filter.rb
|
158
145
|
- lib/bio-table/formatter.rb
|