bio-table 0.9.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +46 -2
- data/VERSION +1 -1
- data/bin/bio-table +246 -215
- data/lib/bio-table.rb +1 -0
- data/lib/bio-table/count.rb +39 -0
- data/lib/bio-table/filter.rb +1 -0
- data/lib/bio-table/merge.rb +3 -1
- data/lib/bio-table/parser.rb +6 -4
- data/lib/bio-table/rewrite.rb +9 -2
- data/lib/bio-table/table_apply.rb +4 -0
- data/lib/bio-table/tableload.rb +2 -1
- data/lib/bio-table/validator.rb +4 -4
- data/test/data/regression/table1-STDIN.ref +1 -0
- metadata +3 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b33f9729f357116b8a58a31cdefdba144c5bfb5d
|
4
|
+
data.tar.gz: 04ed672ff432dcbe3b611a1441e83a84db5ebecc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca15864e31d7c9dcfae49edbf281e27e4afae738a507787bc5b1860ee42ead5e2e4a418387fdbcb9c68ef674d86417a0760151db51b89af33ee1de5617275cee
|
7
|
+
data.tar.gz: 3cf3e0540bf9f3beea349d3e4db8d3792b6b394070171b451ea8105a9c9b89ba09a72e802060dc61aaae92f110707f0fbce5ed7662d996ef95257fb88c0e6386
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -38,6 +38,7 @@ Features:
|
|
38
38
|
* Calculate new values
|
39
39
|
* Calculate column statistics (mean, standard deviation)
|
40
40
|
* Diff between tables, selecting on specific column values
|
41
|
+
* Count elements in columns
|
41
42
|
* Merge tables side by side on column value/rowname
|
42
43
|
* Split/reduce tables by column
|
43
44
|
* Write formatted tables, e.g. HTML, LaTeX
|
@@ -174,6 +175,13 @@ in the table, use unshift headers, 0 becomes an 'ID' column
|
|
174
175
|
bio-table table1.csv --unshift-headers --columns 0,1,8,2,4,6
|
175
176
|
```
|
176
177
|
|
178
|
+
Another option pads short rows with empty fields so that every row has
|
179
|
+
the same number of fields as the header
|
180
|
+
|
181
|
+
```sh
|
182
|
+
bio-table table1.csv --pad-fields
|
183
|
+
```
|
184
|
+
|
177
185
|
Duplicate columns with
|
178
186
|
|
179
187
|
```sh
|
@@ -186,6 +194,13 @@ Combine column values (more on rewrite below)
|
|
186
194
|
bio-table table1.csv --rewrite "rowname = rowname + '-' + field[0]"
|
187
195
|
```
|
188
196
|
|
197
|
+
To insert a table column simply add a tab, e.g., to inject a
|
198
|
+
column containing 'PATHWAY'
|
199
|
+
|
200
|
+
```sh
|
201
|
+
bio-table table1.csv --rewrite 'field[0] = "PATHWAY\t"+field[0]'
|
202
|
+
```
|
203
|
+
|
189
204
|
To filter for columns using a regular expression
|
190
205
|
|
191
206
|
```sh
|
@@ -220,6 +235,29 @@ Another option is to use (lazy) values:
|
|
220
235
|
|
221
236
|
which saves the typing to to_f.
|
222
237
|
|
238
|
+
Another feature is counting column elements. With
|
239
|
+
|
240
|
+
```sh
|
241
|
+
bio-table table1.csv --count 0,1,4
|
242
|
+
```
|
243
|
+
|
244
|
+
All records are combined that have the same rowname and values in
|
245
|
+
columns 0 and 3. In addition a column is added counting the number of
|
246
|
+
merged rows. So,
|
247
|
+
|
248
|
+
```
|
249
|
+
hs8 48713371 53713371 G SAMPLE005
|
250
|
+
hs8 48713371 53713371 G SAMPLE005
|
251
|
+
hs9 136643994 141643994 C SAMPLE005
|
252
|
+
```
|
253
|
+
|
254
|
+
becomes
|
255
|
+
|
256
|
+
```
|
257
|
+
hs8 48713371 53713371 G SAMPLE005 2
|
258
|
+
hs9 136643994 141643994 C SAMPLE005 1
|
259
|
+
```
|
260
|
+
|
223
261
|
### Statistics
|
224
262
|
|
225
263
|
bio-table can handle some column statistics using the Ruby statsample
|
@@ -287,9 +325,11 @@ with NA's, unless you add a filter, e.g.
|
|
287
325
|
|
288
326
|
```sh
|
289
327
|
bio-table --merge table1.csv table2.csv --num-filter "values.compact.size == values.size"
|
290
|
-
|
291
328
|
```
|
292
329
|
|
330
|
+
If you don't want the headers to be 'restyled' on merge, use the --keep-headers
|
331
|
+
override.
|
332
|
+
|
293
333
|
### Splitting a table
|
294
334
|
|
295
335
|
Splitting a table by column is possible by named or indexed columns,
|
@@ -554,7 +594,11 @@ Coming soon
|
|
554
594
|
The API doc is online. For more code examples see the test files in
|
555
595
|
the source tree.
|
556
596
|
|
557
|
-
|
597
|
+
## Troubleshooting
|
598
|
+
|
599
|
+
Run bio-table with the --debug switch to get stack traces. Use --debug
|
600
|
+
and/or --trace for more output.
|
601
|
+
|
558
602
|
## Project home page
|
559
603
|
|
560
604
|
Information on the source tree, documentation, examples, issues and
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
1.0.0
|
data/bin/bio-table
CHANGED
@@ -2,280 +2,311 @@
|
|
2
2
|
#
|
3
3
|
# BioRuby bio-table Plugin BioTable
|
4
4
|
# Author:: Pjotr Prins
|
5
|
-
# Copyright:: 2012
|
6
5
|
|
7
|
-
|
8
|
-
$: << File.join(rootpath,'lib')
|
6
|
+
begin
|
9
7
|
|
10
|
-
|
8
|
+
rootpath = File.dirname(File.dirname(__FILE__))
|
9
|
+
$: << File.join(rootpath,'lib')
|
11
10
|
|
12
|
-
|
11
|
+
_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp
|
13
12
|
|
14
|
-
|
13
|
+
INPUT_ON_STDIN = !$stdin.tty?
|
15
14
|
|
16
|
-
|
15
|
+
$stderr.print "bio-table "+_VERSION+" Copyright (C) 2012-2014 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"
|
17
16
|
|
18
|
-
|
17
|
+
USAGE =<<EOU
|
19
18
|
|
20
|
-
|
19
|
+
bio-table transforms, filters and reorders table files (CSV, tab-delimited).
|
21
20
|
|
22
|
-
|
23
|
-
print USAGE
|
24
|
-
end
|
21
|
+
EOU
|
25
22
|
|
26
|
-
|
27
|
-
|
28
|
-
|
23
|
+
if ARGV.size == 0 and not INPUT_ON_STDIN
|
24
|
+
print USAGE
|
25
|
+
end
|
29
26
|
|
30
|
-
|
31
|
-
|
27
|
+
require 'bio-table'
|
28
|
+
require 'optparse'
|
29
|
+
require 'bio-logger'
|
32
30
|
|
33
|
-
Bio::Log::
|
34
|
-
Bio::Log::
|
31
|
+
log = Bio::Log::LoggerPlus.new 'bio-table'
|
32
|
+
# log.outputters = Bio::Log::Outputter.stderr
|
35
33
|
|
36
|
-
|
37
|
-
|
38
|
-
opts = OptionParser.new do |o|
|
39
|
-
o.banner = "Usage: #{File.basename($0)} [options] filename\n\n"
|
34
|
+
Bio::Log::CLI.logger('stderr')
|
35
|
+
Bio::Log::CLI.trace('info')
|
40
36
|
|
41
|
-
|
42
|
-
|
43
|
-
|
37
|
+
options = {show_help: false, write_header: true, skip: 0}
|
38
|
+
options[:show_help] = true if ARGV.size == 0 and not INPUT_ON_STDIN
|
39
|
+
opts = OptionParser.new do |o|
|
40
|
+
o.banner = "Usage: #{File.basename($0)} [options] filename\n\n"
|
44
41
|
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
o.on('--num-filter expression', 'Numeric filtering function') do |par|
|
43
|
+
options[:num_filter] = par
|
44
|
+
end
|
48
45
|
|
49
|
-
|
50
|
-
|
51
|
-
|
46
|
+
o.on('--filter expression', 'Generic filtering function') do |par|
|
47
|
+
options[:filter] = par
|
48
|
+
end
|
52
49
|
|
53
|
-
|
54
|
-
|
55
|
-
|
50
|
+
o.on('--rewrite expression', 'Rewrite function') do |par|
|
51
|
+
options[:rewrite] = par
|
52
|
+
end
|
56
53
|
|
57
|
-
|
58
|
-
|
59
|
-
|
54
|
+
o.on('--count list', Array, 'Merge and count similar rows') do |list|
|
55
|
+
options[:count] = list
|
56
|
+
end
|
60
57
|
|
61
|
-
|
62
|
-
|
63
|
-
|
58
|
+
o.on('--columns list', Array, 'List of column names or indices') do |l|
|
59
|
+
options[:columns] = l
|
60
|
+
end
|
64
61
|
|
65
|
-
|
66
|
-
|
67
|
-
ARGV.unshift l.first
|
68
|
-
l = ["0"]
|
62
|
+
o.on('--column-filter expression', 'Column name filtering function') do |par|
|
63
|
+
options[:column_filter] = par
|
69
64
|
end
|
70
|
-
options[:diff] = l
|
71
|
-
end
|
72
65
|
|
73
|
-
|
74
|
-
|
75
|
-
ARGV.unshift l.first
|
76
|
-
l = ["0"]
|
66
|
+
o.on('--merge','Merge tables by rowname') do
|
67
|
+
options[:merge] = true
|
77
68
|
end
|
78
|
-
options[:overlap] = l
|
79
|
-
end
|
80
|
-
|
81
|
-
o.on('--merge','Merge tables by rowname') do
|
82
|
-
options[:merge] = true
|
83
|
-
end
|
84
69
|
|
85
|
-
|
70
|
+
o.on('--diff list',Array,'Diff two input files on columns (default rownames)') do |l|
|
71
|
+
if l.size==1 and File.exist?(l.first)
|
72
|
+
ARGV.unshift l.first
|
73
|
+
l = ["0"]
|
74
|
+
end
|
75
|
+
options[:diff] = l
|
76
|
+
end
|
86
77
|
|
87
|
-
|
88
|
-
|
89
|
-
|
78
|
+
o.on('--overlap list',Array,'Find overlap of two input files on columns)') do |l|
|
79
|
+
if l.size==1 and File.exist?(l.first)
|
80
|
+
ARGV.unshift l.first
|
81
|
+
l = ["0"]
|
82
|
+
end
|
83
|
+
options[:overlap] = l
|
84
|
+
end
|
85
|
+
|
86
|
+
o.on('--merge','Merge tables by rowname') do
|
87
|
+
options[:merge] = true
|
88
|
+
end
|
90
89
|
|
91
|
-
|
92
|
-
options[:with_headers] = true
|
93
|
-
options[:write_header] = false
|
94
|
-
end
|
90
|
+
o.separator "\n\tOverrides:\n\n"
|
95
91
|
|
96
|
-
|
97
|
-
|
98
|
-
|
92
|
+
o.on('--skip lines',Integer,'Skip the first lines before parsing') do |skip|
|
93
|
+
options[:skip] = skip
|
94
|
+
end
|
99
95
|
|
100
|
-
|
101
|
-
|
102
|
-
|
96
|
+
o.on('--with-headers','Include the header element in filtering etc.') do
|
97
|
+
options[:with_headers] = true
|
98
|
+
options[:write_header] = false
|
99
|
+
end
|
103
100
|
|
104
|
-
|
105
|
-
|
106
|
-
|
101
|
+
o.on('--with-rownames','Include the rownames in filtering etc.') do
|
102
|
+
options[:with_rownames] = true
|
103
|
+
end
|
107
104
|
|
108
|
-
|
105
|
+
o.on('--unshift-headers','Add an extra header element at the front (header contains one fewer field than the number of columns)') do
|
106
|
+
options[:unshift_headers] = true
|
107
|
+
end
|
109
108
|
|
110
|
-
|
111
|
-
|
112
|
-
|
109
|
+
o.on('--keep-headers','Keep original headers on merge') do
|
110
|
+
options[:keep_headers] = true
|
111
|
+
end
|
113
112
|
|
114
|
-
o.separator "\n\tFormat and options:\n\n"
|
115
|
-
|
116
|
-
o.on('--in-format [tab,csv,split,regex]', [:tab, :csv, :split, :regex], 'Input format (default tab)') do |par|
|
117
|
-
options[:in_format] = par.to_sym
|
118
|
-
end
|
119
113
|
|
120
|
-
|
121
|
-
|
122
|
-
|
114
|
+
o.on('--strip-quotes','Strip quotes from table fields') do
|
115
|
+
options[:strip_quotes] = true
|
116
|
+
end
|
123
117
|
|
124
|
-
|
125
|
-
|
126
|
-
|
118
|
+
o.on('--pad-fields','Add empty fields if a row is too short') do
|
119
|
+
options[:pad_fields] = true
|
120
|
+
end
|
127
121
|
|
128
|
-
|
129
|
-
options[:evaluate] = s
|
130
|
-
end
|
122
|
+
o.separator "\n\tTransform:\n\n"
|
131
123
|
|
132
|
-
|
133
|
-
|
134
|
-
|
124
|
+
o.on('--transform-ids [downcase,upcase]',[:downcase,:upcase],'Transform column and row identifiers') do |par|
|
125
|
+
options[:transform_ids] = par.to_sym
|
126
|
+
end
|
135
127
|
|
136
|
-
|
137
|
-
|
138
|
-
|
128
|
+
o.separator "\n\tFormat and options:\n\n"
|
129
|
+
|
130
|
+
o.on('--in-format [tab,csv,split,regex]', [:tab, :csv, :split, :regex], 'Input format (default tab)') do |par|
|
131
|
+
options[:in_format] = par.to_sym
|
132
|
+
end
|
139
133
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
o.separator "\n\tVerbosity:\n\n"
|
145
|
-
|
146
|
-
o.on("--logger filename",String,"Log to file (default stderr)") do | name |
|
147
|
-
Bio::Log::CLI.logger(name)
|
148
|
-
end
|
134
|
+
o.on('--format [tab,csv,rdf,eval]', [:tab, :csv, :rdf, :eval], 'Output format (default tab)') do |par|
|
135
|
+
options[:format] = par.to_sym
|
136
|
+
end
|
149
137
|
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
o.on("-q", "--quiet", "Run quietly") do |q|
|
155
|
-
Bio::Log::CLI.trace('error')
|
156
|
-
end
|
157
|
-
|
158
|
-
o.on("-v", "--verbose", "Run verbosely") do |v|
|
159
|
-
Bio::Log::CLI.trace('info')
|
160
|
-
end
|
161
|
-
|
162
|
-
o.on("--debug", "Show debug messages") do |v|
|
163
|
-
Bio::Log::CLI.trace('debug')
|
164
|
-
end
|
138
|
+
o.on("--split-on command",String,"Split on string or regex (use with --in-format)") do | s |
|
139
|
+
options[:split_on] = s
|
140
|
+
end
|
165
141
|
|
166
|
-
|
142
|
+
o.on("-e command",String,"Evaluate output command (use with --format eval)") do | s |
|
143
|
+
options[:evaluate] = s
|
144
|
+
end
|
167
145
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
end
|
146
|
+
o.on("--fasta regex",String,"Read FASTA format creating ID with regex") do | regex |
|
147
|
+
options[:fasta] = regex
|
148
|
+
end
|
172
149
|
|
173
|
-
|
174
|
-
|
150
|
+
o.on('--blank-nodes','Output (RDF) blank nodes - allowing for duplicate row names') do
|
151
|
+
options[:blank_nodes] = true
|
152
|
+
end
|
175
153
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
154
|
+
o.on('--statistics','Output column statistics') do
|
155
|
+
options[:statistics] = true
|
156
|
+
end
|
157
|
+
|
158
|
+
o.separator "\n\tVerbosity:\n\n"
|
159
|
+
|
160
|
+
o.on("--logger filename",String,"Log to file (default stderr)") do | name |
|
161
|
+
Bio::Log::CLI.logger(name)
|
162
|
+
end
|
180
163
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
164
|
+
o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
|
165
|
+
Bio::Log::CLI.trace(s)
|
166
|
+
end
|
167
|
+
|
168
|
+
o.on("-q", "--quiet", "Run quietly") do |q|
|
169
|
+
Bio::Log::CLI.trace('error')
|
170
|
+
end
|
171
|
+
|
172
|
+
o.on("-v", "--verbose", "Run verbosely") do |v|
|
173
|
+
Bio::Log::CLI.trace('info')
|
174
|
+
end
|
175
|
+
|
176
|
+
o.on("--debug", "Show debug messages") do |v|
|
177
|
+
Bio::Log::CLI.trace('debug')
|
178
|
+
options[:debug] = true
|
179
|
+
end
|
186
180
|
|
187
|
-
|
188
|
-
logger = Bio::Log::LoggerPlus['bio-table']
|
189
|
-
logger.info [options]
|
181
|
+
o.separator ""
|
190
182
|
|
191
|
-
|
183
|
+
o.on_tail('-h', '--help', 'Display this help and exit') do
|
184
|
+
options[:show_help] = true
|
185
|
+
end
|
186
|
+
end
|
192
187
|
|
193
|
-
|
194
|
-
|
195
|
-
logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
|
196
|
-
t1 = TableReader::read_file(ARGV[0], options)
|
197
|
-
t2 = TableReader::read_file(ARGV[1], options)
|
198
|
-
t = Diff::diff_tables(t1,t2, options)
|
199
|
-
t.write(options)
|
200
|
-
exit
|
201
|
-
end
|
188
|
+
begin
|
189
|
+
opts.parse!(ARGV)
|
202
190
|
|
203
|
-
if options[:
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
t2 = TableReader::read_file(ARGV[1], options)
|
208
|
-
t = Overlap::overlap_tables(t1,t2, options)
|
209
|
-
t.write(options)
|
210
|
-
exit
|
211
|
-
end
|
191
|
+
if options[:show_help]
|
192
|
+
print opts
|
193
|
+
print USAGE
|
194
|
+
end
|
212
195
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
196
|
+
# TODO: your code here
|
197
|
+
# use options for your logic
|
198
|
+
rescue OptionParser::InvalidOption => e
|
199
|
+
options[:invalid_argument] = e.message
|
200
|
+
end
|
201
|
+
|
202
|
+
Bio::Log::CLI.configure('bio-table')
|
203
|
+
logger = Bio::Log::LoggerPlus['bio-table']
|
204
|
+
logger.info [options]
|
205
|
+
|
206
|
+
include BioTable
|
207
|
+
|
208
|
+
if options[:diff]
|
209
|
+
logger.warn "Column settings are ignored for --diff" if options[:columns]
|
210
|
+
logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
|
211
|
+
t1 = TableReader::read_file(ARGV[0], options)
|
212
|
+
t2 = TableReader::read_file(ARGV[1], options)
|
213
|
+
t = Diff::diff_tables(t1,t2, options)
|
214
|
+
t.write(options)
|
215
|
+
exit
|
216
|
+
end
|
217
|
+
|
218
|
+
if options[:overlap]
|
219
|
+
logger.warn "Column settings are ignored for --overlap" if options[:columns]
|
220
|
+
logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
|
221
|
+
t1 = TableReader::read_file(ARGV[0], options)
|
222
|
+
t2 = TableReader::read_file(ARGV[1], options)
|
223
|
+
t = Overlap::overlap_tables(t1,t2, options)
|
224
|
+
t.write(options)
|
225
|
+
exit
|
226
|
+
end
|
227
|
+
|
228
|
+
if options[:fasta]
|
229
|
+
logger.warn "Column settings are ignored for --fasta" if options[:columns]
|
230
|
+
ARGV.each do | fn |
|
231
|
+
print "id\tseq\n"
|
232
|
+
FastaReader.new(fn,options[:fasta]).each do | rec |
|
233
|
+
print rec.id,"\t",rec.seq,"\n"
|
234
|
+
end
|
219
235
|
end
|
236
|
+
exit
|
220
237
|
end
|
221
|
-
exit
|
222
|
-
end
|
223
238
|
|
224
|
-
if options[:merge]
|
225
|
-
|
226
|
-
|
227
|
-
|
239
|
+
if options[:merge]
|
240
|
+
ts = []
|
241
|
+
ARGV.each do | fn |
|
242
|
+
ts << TableReader::read_file(fn, options)
|
243
|
+
end
|
244
|
+
t = Merge::merge_tables(ts, options)
|
245
|
+
t.write(options)
|
246
|
+
exit
|
228
247
|
end
|
229
|
-
t = Merge::merge_tables(ts, options)
|
230
|
-
t.write(options)
|
231
|
-
exit
|
232
|
-
end
|
233
248
|
|
234
|
-
#
|
235
|
-
# We also support STDIN for the first 'file'. A non-blocking idea can
|
236
|
-
# be found here:
|
237
|
-
#
|
238
|
-
# http://eric.lubow.org/2010/ruby/multiple-input-locations-from-bash-into-ruby/
|
239
|
-
#
|
249
|
+
#
|
250
|
+
# We also support STDIN for the first 'file'. A non-blocking idea can
|
251
|
+
# be found here:
|
252
|
+
#
|
253
|
+
# http://eric.lubow.org/2010/ruby/multiple-input-locations-from-bash-into-ruby/
|
254
|
+
#
|
240
255
|
|
241
|
-
writer =
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
256
|
+
writer =
|
257
|
+
if options[:format] == :rdf
|
258
|
+
BioTable::RDF::Writer.new(options[:blank_nodes])
|
259
|
+
else
|
260
|
+
BioTable::TableWriter::Writer.new(options[:format],options[:evaluate])
|
261
|
+
end
|
262
|
+
|
263
|
+
statistics = if options[:statistics]
|
264
|
+
BioTable::Statistics::Accumulate.new
|
265
|
+
end
|
266
|
+
count = if options[:count]
|
267
|
+
BioTable::Count::CountTracker.new(options[:count])
|
268
|
+
end
|
247
269
|
|
248
|
-
if INPUT_ON_STDIN
|
249
270
|
opts = options.dup # so we can 'safely' modify options
|
250
271
|
has_input = false
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
272
|
+
walk_table = lambda { |f|
|
273
|
+
BioTable::TableLoader.emit(f, opts).each do |row, type| # type is :header or :row
|
274
|
+
if statistics
|
275
|
+
statistics.add(row,type)
|
276
|
+
else
|
277
|
+
row = count.add(row,type) if count # merge and count
|
278
|
+
writer.write(TableRow.new(row[0],row[1..-1]),type) if row
|
279
|
+
end
|
280
|
+
has_input = true
|
281
|
+
end
|
282
|
+
if count
|
283
|
+
row = count.add(row,:row,flush: true)
|
284
|
+
writer.write(TableRow.new(row[0],row[1..-1]),:row) if row
|
285
|
+
end
|
286
|
+
options[:write_header] = false if has_input # don't write the header for chained files
|
287
|
+
}
|
263
288
|
|
264
|
-
|
265
|
-
|
266
|
-
f = File.open(fn,"r")
|
267
|
-
if not opts[:in_format] and fn =~ /\.csv$/
|
268
|
-
logger.debug "Autodetected CSV file"
|
269
|
-
opts[:in_format] = :csv
|
289
|
+
if INPUT_ON_STDIN
|
290
|
+
walk_table.call(STDIN)
|
270
291
|
end
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
292
|
+
|
293
|
+
ARGV.each do | fn |
|
294
|
+
opts = options.dup # so we can 'safely' modify options
|
295
|
+
f = File.open(fn,"r")
|
296
|
+
if not opts[:in_format] and fn =~ /\.csv$/i
|
297
|
+
logger.debug "Autodetected CSV file"
|
298
|
+
opts[:in_format] = :csv
|
276
299
|
end
|
300
|
+
walk_table.call(f)
|
277
301
|
end
|
278
|
-
options[:write_header] = false # don't write the header for chained files
|
279
|
-
end
|
280
302
|
|
281
|
-
statistics.write(writer) if statistics
|
303
|
+
statistics.write(writer) if statistics
|
304
|
+
|
305
|
+
rescue => msg
|
306
|
+
if options[:debug]
|
307
|
+
raise
|
308
|
+
else
|
309
|
+
$stderr.print "Error: ",msg
|
310
|
+
exit 1
|
311
|
+
end
|
312
|
+
end
|
data/lib/bio-table.rb
CHANGED
data/lib/bio-table/count.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
module BioTable
|
2
|
+
module Count
|
3
|
+
# Track rows that have the same column items. Return the last match of the cumulative list
|
4
|
+
# with the count attached.
|
5
|
+
class CountTracker
|
6
|
+
def initialize list
|
7
|
+
@list = list.map { |item| item.to_i }
|
8
|
+
@rows = []
|
9
|
+
end
|
10
|
+
|
11
|
+
# Add a row and if it differs send the last merged edition back
|
12
|
+
# type is :header or :row
|
13
|
+
def add row, type, flush: false
|
14
|
+
return row+["count"] if type == :header
|
15
|
+
num = @rows.size
|
16
|
+
prev = @rows.last
|
17
|
+
if flush
|
18
|
+
prev+[num]
|
19
|
+
else
|
20
|
+
# Take the list and compare each item to the previous row
|
21
|
+
prev_same = if prev
|
22
|
+
@list.reduce(true) { |memo,i| memo && (row[i]==prev[i]) }
|
23
|
+
else
|
24
|
+
false
|
25
|
+
end
|
26
|
+
if prev_same
|
27
|
+
@rows << row
|
28
|
+
else
|
29
|
+
@rows = []
|
30
|
+
@rows << row
|
31
|
+
return prev+[num] if prev
|
32
|
+
end
|
33
|
+
nil
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/bio-table/filter.rb
CHANGED
data/lib/bio-table/merge.rb
CHANGED
@@ -6,7 +6,9 @@ module BioTable
|
|
6
6
|
logger = Bio::Log::LoggerPlus['bio-table']
|
7
7
|
logger.info("Merging tables")
|
8
8
|
headers = tables.first.header[0..0] +
|
9
|
-
tables.map { |t| t.header[1..-1].map{|n|
|
9
|
+
tables.map { |t| t.header[1..-1].map{|n|
|
10
|
+
(options[:keep_headers] ? n : t.name+'-'+n)
|
11
|
+
}}.flatten
|
10
12
|
t = Table.new(headers)
|
11
13
|
# index tables on rownames
|
12
14
|
idxs = []
|
data/lib/bio-table/parser.rb
CHANGED
@@ -7,21 +7,23 @@ module BioTable
|
|
7
7
|
# Converts a string into an array of string fields
|
8
8
|
def LineParser::parse(line, in_format, split_on)
|
9
9
|
if in_format == :csv
|
10
|
-
|
10
|
+
$stderr.print "WARNING: this looks like a tab delimited file to me!\n" if line =~ /\t/
|
11
|
+
CSV.parse_line(line)
|
11
12
|
elsif in_format == :split
|
12
|
-
line.
|
13
|
+
line.split(split_on).map { |field|
|
13
14
|
fld = field.strip
|
14
15
|
fld = nil if fld == "NA"
|
15
16
|
fld
|
16
17
|
}
|
17
18
|
elsif in_format == :regex
|
18
|
-
line.
|
19
|
+
line.split(/#{split_on}/).map { |field|
|
19
20
|
fld = field.strip
|
20
21
|
fld = nil if fld == "NA"
|
21
22
|
fld
|
22
23
|
}
|
23
24
|
else
|
24
|
-
|
25
|
+
$stderr.print "WARNING: this looks like a tab delimited file to me!\n" if line =~ /,"/
|
26
|
+
line.split("\t").map { |field|
|
25
27
|
fld = field.strip
|
26
28
|
fld = nil if fld == "NA"
|
27
29
|
fld
|
data/lib/bio-table/rewrite.rb
CHANGED
@@ -4,9 +4,10 @@ module BioTable
|
|
4
4
|
|
5
5
|
# Rewrite fields. Both field and fields can be used, but not at the same time.
|
6
6
|
def Rewrite::rewrite code, rowname, field
|
7
|
-
fields = field
|
8
|
-
original = field
|
7
|
+
fields = field.dup
|
8
|
+
original = field.dup
|
9
9
|
values = LazyValues.new(field)
|
10
|
+
value = values
|
10
11
|
return rowname,field if not code or code==""
|
11
12
|
begin
|
12
13
|
eval(code)
|
@@ -14,6 +15,12 @@ module BioTable
|
|
14
15
|
$stderr.print "Failed to evaluate ",rowname," ",field," with ",code,"\n"
|
15
16
|
raise
|
16
17
|
end
|
18
|
+
if (fields & original != fields.uniq) and (field & original != field.uniq)
|
19
|
+
$stderr.print [:original,original],"\n"
|
20
|
+
$stderr.print [:fields,fields],"\n"
|
21
|
+
$stderr.print [:field,field],"\n"
|
22
|
+
raise "You can not rewrite both field and fields!"
|
23
|
+
end
|
17
24
|
field = fields if fields != original
|
18
25
|
return rowname,field
|
19
26
|
end
|
data/lib/bio-table/table_apply.rb
CHANGED
@@ -51,9 +51,13 @@ module BioTable
|
|
51
51
|
return column_idx, new_header
|
52
52
|
end
|
53
53
|
|
54
|
+
# Take a line as a string and return it as a tuple of rowname and datafields
|
54
55
|
def parse_row(line_num, line, header, column_idx, prev_fields, options)
|
55
56
|
fields = LineParser::parse(line, options[:in_format], options[:split_on])
|
56
57
|
return nil,nil if fields.compact == []
|
58
|
+
if options[:pad_fields] and fields.size < header.size
|
59
|
+
fields += [''] * (header.size - fields.size)
|
60
|
+
end
|
57
61
|
fields = Formatter::strip_quotes(fields) if @strip_quotes
|
58
62
|
fields = Formatter::transform_row_ids(@transform_ids, fields) if @transform_ids
|
59
63
|
fields = Filter::apply_column_filter(fields,column_idx)
|
data/lib/bio-table/tableload.rb
CHANGED
@@ -9,7 +9,7 @@ module BioTable
|
|
9
9
|
#
|
10
10
|
# Note that you need to pass in :with_header to get the header row
|
11
11
|
def TableLoader::emit generator, options = {}
|
12
|
-
table_apply = TableApply.new(options)
|
12
|
+
table_apply = TableApply.new(options) # parser and filters
|
13
13
|
column_index = nil, prev_line = nil
|
14
14
|
skip = options[:skip]
|
15
15
|
skip = 0 if skip == nil
|
@@ -19,6 +19,7 @@ module BioTable
|
|
19
19
|
generator.each_with_index do |line, line_num|
|
20
20
|
# p [line_num, line]
|
21
21
|
if line_num-skip == 0
|
22
|
+
# ---- This is the header section
|
22
23
|
header = table_apply.parse_header(line, options)
|
23
24
|
# Validator::valid_header?(header, @header) # compare against older header when merging
|
24
25
|
column_index,header = table_apply.column_index(header) # we may rewrite the header
|
data/lib/bio-table/validator.rb
CHANGED
@@ -4,8 +4,8 @@ module BioTable
|
|
4
4
|
def Validator::valid_header? header, old_header
|
5
5
|
if old_header
|
6
6
|
if header - old_header != []
|
7
|
-
|
8
|
-
|
7
|
+
$stderr.print old_header,"\n"
|
8
|
+
$stderr.print header,"\n"
|
9
9
|
raise "Headers do not compare!"
|
10
10
|
end
|
11
11
|
end
|
@@ -15,8 +15,8 @@ module BioTable
|
|
15
15
|
def Validator::valid_row? line_number, fields, last_fields
|
16
16
|
return false if fields == nil or fields.size == 0
|
17
17
|
if last_fields and last_fields.size>0 and (fields.size != last_fields.size)
|
18
|
-
|
19
|
-
|
18
|
+
$stderr.print last_fields,"\n"
|
19
|
+
$stderr.print fields,"\n"
|
20
20
|
throw "Number of fields diverge in line #{line_number} (size #{fields.size}, expected #{last_fields.size})"
|
21
21
|
end
|
22
22
|
true
|
data/test/data/regression/table1-STDIN.ref
CHANGED
@@ -378,6 +378,7 @@
|
|
378
378
|
110173,9.97,18.59,12.35,13.67,14.56,14.63,12.69,18.49,14.23,16.23,,,20.48,16.47,20.68,13.14,18.88,14.3,13.67,20.54,15.99,16.15,21.33,17.06,,16.05,,,Manba,"mannosidase, beta A, lysosomal,mannosidase, beta A, lysosomal,mannosidase, beta A, lysosomal,"
|
379
379
|
110187,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0.31,0,0,0,,0,,,Abpg,"androgen binding protein gamma"
|
380
380
|
110196,11.21,27.98,12.13,16.57,16.77,12.76,14.31,13.07,9.22,10.51,,,29.03,16.7,7.62,24.09,15.31,28.3,10.92,14.73,23.27,15.13,36.77,10.05,,15.15,,,Fdps,"farnesyl diphosphate synthetase"
|
381
|
+
|
381
382
|
0 0.06 NA 0 0 0 0 0.11 0 0 0 NA NA 0 0 0 0 0 0 0 0 0 0 0 0 NA 0 NA NA Mal2 MAL2 proteolipid protein
|
382
383
|
213.15 236.88 213.95 213.15 253.49 198 231.56 200.96 255.2 214.04 231.46 NA NA 233.23 241.26 237.53 171.87 237.13 162.3 252.13 284.85 188.76 253.43 220.15 305.52 NA 217.42 NA NA Nckap1l NCK associated protein 1 like,NCK associated protein 1 like,
|
383
384
|
0 0 NA 0 0 0.07 0.04 0 0 0 0 NA NA 0.02 0 0 0 0 0 0.06 0 0 0 0.02 0 NA 0 NA NA Csdc2 RNA-binding protein pippin
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-table
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pjotr Prins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -80,20 +80,6 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 2.0.0
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: bio
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: 1.4.2
|
90
|
-
type: :development
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: 1.4.2
|
97
83
|
- !ruby/object:Gem::Dependency
|
98
84
|
name: rdoc
|
99
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -153,6 +139,7 @@ files:
|
|
153
139
|
- features/support/env.rb
|
154
140
|
- lib/bio-table.rb
|
155
141
|
- lib/bio-table/columns.rb
|
142
|
+
- lib/bio-table/count.rb
|
156
143
|
- lib/bio-table/diff.rb
|
157
144
|
- lib/bio-table/filter.rb
|
158
145
|
- lib/bio-table/formatter.rb
|