bio-table 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - jruby-19mode # JRuby in 1.9 mode
6
+ - rbx-19mode
7
+ # - 1.8.7
8
+ # - jruby-18mode # JRuby in 1.8 mode
9
+ # - rbx-18mode
10
+
11
+ # uncomment this line if your project needs to run something other than `rake`:
12
+ # script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+
6
+ gem "bio-logger"
7
+
8
+ # Add dependencies to develop your gem here.
9
+ # Include everything needed to run rake, tests, features, etc.
10
+ group :development do
11
+ gem "rspec", "~> 2.8.0"
12
+ gem "rdoc", "~> 3.12"
13
+ gem "cucumber", ">= 0"
14
+ gem "bundler", "> 1.0.0"
15
+ gem "jeweler", "~> 1.8.3"
16
+ gem "bio", ">= 1.4.2"
17
+ gem "rdoc", "~> 3.12"
18
+ end
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Pjotr Prins
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,283 @@
1
+ # bio-table
2
+
3
+ [![Build Status](https://secure.travis-ci.org/pjotrp/bioruby-table.png)](http://travis-ci.org/pjotrp/bioruby-table)
4
+
5
+ Tables of data are often used in bioinformatics, especially in
6
+ conjunction with Excel spreadsheets and DB queries. This biogem
7
+ contains support for reading tables, writing tables, and manipulation
8
+ of rows and columns, both using a command line interface and through a
9
+ Ruby library. If you don't like R dataframes, maybe you like this.
10
+ Also, because bio-table is command line driven, it easily fits in a
11
+ pipe-line setup.
12
+
13
+ Quick example, say we want to filter out rows that contain certain
14
+ p-values listed in the 4th column:
15
+
16
+ ```
17
+ bio-table test/data/input/table1.csv --num-filter "values[3] <= 0.05"
18
+ ```
19
+
20
+ bio-table should be lazy, be good for big data, and the library
21
+ should support a functional style of programming. You don't need to know Ruby
22
+ to use the command line interface (CLI).
23
+
24
+ Note: this software is under active development!
25
+
26
+ ## Installation
27
+
28
+ ```sh
29
+ gem install bio-table
30
+ ```
31
+
32
+ ## The command line interface (CLI)
33
+
34
+ ### Transforming a table
35
+
36
+ Tables can be transformed through the command line. To transform a
37
+ comma separated file to a tab delimited one
38
+
39
+ ```
40
+ bio-table test/data/input/table1.csv --in-format csv --format tab > test1.tab
41
+ ```
42
+
43
+ Tab is actually the general default. Still, if the file name ends in
44
+ csv, it will assume CSV. To convert the table back
45
+
46
+ ```
47
+ bio-table test1.tab --format csv > table1.csv
48
+ ```
49
+
50
+ To filter out rows that contain certain values
51
+
52
+ ```
53
+ bio-table test/data/input/table1.csv --num-filter "values[3] <= 0.05" > test1a.tab
54
+ ```
55
+
56
+ The filter ignores the header row, and the row names. If you need
57
+ either, use the switches --with-header and --with-rownames. Filters can also do math, e.g. to list all rows where the difference between two columns is at least 0.05
58
+
59
+ ```
60
+ bio-table test/data/input/table1.csv --num-filter "values[3]-values[6] >= 0.05" > test1a.tab
61
+ ```
62
+
63
+ or, list all rows that have at least one field with a value >= 1000.0
64
+
65
+ ```
66
+ bio-table test/data/input/table1.csv --num-filter "values.max >= 1000.0" > test1a.tab
67
+ ```
68
+
69
+ Produce all rows that have more than 3 values of 3.0 or higher and at least one value
70
+ of 10.0 or higher:
71
+
72
+ ```
73
+ bio-table test/data/input/table1.csv --num-filter "values.max >= 10.0 and values.count{|x| x>=3.0} > 3"
74
+ ```
75
+
76
+ How is that for expressiveness? Looks like Ruby to me.
77
+
78
+ The --num-filter will convert fields lazily to numerical values (only
79
+ valid numbers are converted). If there are NA (nil) values in the table, you
80
+ may wish to remove them, like this
81
+
82
+ ```
83
+ bio-table test/data/input/table1.csv --num-filter "values[0..12].compact.max >= 1000.0" > test1a.tab
84
+ ```
85
+
86
+ which takes the first 13 fields and compact removes the nil values.
87
+
88
+ Also string comparisons and regular expressions can be used. E.g.
89
+ filter on rownames and a row field both containing 'BGT'
90
+
91
+ ```
92
+ # not yet implemented
93
+ bio-table test/data/input/table1.csv --filter "rowname =~ /BGT/ and field[1] =~ /BGT/" > test1a.tab
94
+ ```
95
+
96
+ To reorder/reduce table columns by name
97
+
98
+ ```
99
+ bio-table test/data/input/table1.csv --columns AJ,B6,Axb1,Axb4,AXB13,Axb15,Axb19 > test1a.tab
100
+ ```
101
+
102
+ or use their index numbers
103
+
104
+ ```
105
+ bio-table test/data/input/table1.csv --columns 0,1,8,2,4,6 > test1a.tab
106
+ ```
107
+
108
+ ### Sorting a table
109
+
110
+ To sort a table on columns 4 and 2
111
+
112
+ ```
113
+ # not yet implemented
114
+ bio-table test/data/input/table1.csv --sort 4,2 > test1a.tab
115
+ ```
116
+
117
+ Note: not all is implemented (just yet). Please check bio-table --help first.
118
+
119
+ ### Combining a table
120
+
121
+ You can combine/concat tables by passing in multiple file names
122
+
123
+ bio-table test/data/input/table1.csv test/data/input/table2.csv
124
+
125
+ assuming they have the same headers (you can use the --columns switch!)
126
+
127
+ ### Splitting a table
128
+
129
+ Splitting a table by column is possible by named or indexed columns,
130
+ see the --columns switch.
131
+
132
+ more soon
133
+
134
+ ### Diffing and overlapping tables
135
+
136
+ With two tables it may be interesting to see the differences, or
137
+ overlap, based on shared columns. The bio-table diff command shows the
138
+ difference between two tables using the row names (i.e. those rows
139
+ with rownames that appear in table2, but not in table1)
140
+
141
+ bio-table --diff 0 table1.csv table2.csv
142
+
143
+ To find it the other way, switch the file names
144
+
145
+ bio-table --diff 0 table2.csv table1.csv
146
+
147
+ To diff on something else
148
+
149
+ bio-table --diff 0,3 table2.csv table1.csv
150
+
151
+ creates a (hopefully unique) key using columns 0 and 3 (0 is the rownames column).
152
+
153
+ Similarly
154
+
155
+ bio-table --overlap 2 table1.csv table2.csv
156
+
157
+ finds the overlapping rows, based on column 2 (NYI)
158
+
159
+ ### Different parsers
160
+
161
+ more soon
162
+
163
+ ## Usage
164
+
165
+ ```ruby
166
+ require 'bio-table'
167
+ include BioTable
168
+ ```
169
+
170
+ ### Reading, transforming, and writing a table
171
+
172
+ Note: the Ruby API below is a work in progress.
173
+
174
+ Tables are two dimensional matrixes, which can be read from a file
175
+
176
+ ```
177
+ t = Table.read_file('test/data/input/table1.csv')
178
+ p t.header # print the header array
179
+ p t.name[0],t[0] # print the name of the first row and the row itself
180
+ p t[0][0] # print the top corner field
181
+ ```
182
+
183
+ The table reader has quite a few options for defining field separator,
184
+ which column to use for names etc. More interestingly you can pass a
185
+ function to limit the amount of data read into memory:
186
+
187
+ ```
188
+ t = Table.read_file('test/data/input/table1.csv',
189
+ :by_row => { | row | row[0..3] } )
190
+ ```
191
+
192
+ will create a table of the column name +row[0]+ and 2 table fields. You can use
193
+ the same idea to reformat and reorder table columns when reading data
194
+ into the table. E.g.
195
+
196
+ ```
197
+ t = Table.read_file('test/data/input/table1.csv',
198
+ :by_row => { | row | [row.rowname, row[0..3], row[6].to_i].flatten } )
199
+ ```
200
+
201
+ When a header can not be transformed, it may fail. You can test for
202
+ the header with row.header?, but in this case you
203
+ can pass in a :by_header, which makes :by_row get called only on
204
+ actual table rows.
205
+
206
+ ```
207
+ t = Table.read_file('test/data/input/table1.csv',
208
+ :by_header => { | header | ["Row name", header[0..3], header[6]].flatten },
209
+ :by_row => { | row | [row.rowname, row[0..3], row[6].to_i].flatten } )
210
+ ```
211
+
212
+ When by_row returns nil or false, the table row is skipped. One way to
213
+ transform a file without loading it in memory is
214
+
215
+ ```
216
+ f = File.new('test.tab','w')
217
+ t = Table.read_file('test/data/input/table1.csv',
218
+ :by_row => { | row |
219
+ TableRow::write(f,[row.rowname,row[0..3],row[6].to_i].flatten, :separator => "\t")
220
+ nil # don't create a table in memory, effectively a filter
221
+ })
222
+ ```
223
+
224
+ Another function is :filter which only acts on rows, but can not
225
+ transform them.
226
+
227
+ To write a full table from memory to file use
228
+
229
+ ```
230
+ t.write_file('test1a.csv')
231
+ ```
232
+
233
+ again columns can be reordered/transformed using a function. Another
234
+ option is passing in a list of column numbers or header names, so
235
+ only those get written, e.g.
236
+
237
+ ```
238
+ t.write_file('test1a.csv', columns: [0,1,2,4,6,8])
239
+ t.write_file('test1b.csv', columns: ["AJ","B6","Axb1","Axb4","AXB13","Axb15","Axb19"] )
240
+ ```
241
+
242
+ other options are available for excluding row names (rownames: false), etc.
243
+
244
+ To sort a table file, the current routine is to load the file in
245
+ memory and sort according to table columns. In the near future we aim
246
+ to have a low-memory version, by reading only the sorting columns in
247
+ memory, and indexing them before writing output. That means reading a
248
+ file twice, but being able to handle much larger data.
249
+
250
+ ### Loading a numerical matrix
251
+
252
+ Coming soon
253
+
254
+ ### More...
255
+
256
+ The API doc is online. For more code examples see the test files in
257
+ the source tree.
258
+
259
+
260
+ ## Project home page
261
+
262
+ For information on the source tree, documentation, examples, issues and
263
+ how to contribute, see
264
+
265
+ http://github.com/pjotrp/bioruby-table
266
+
267
+ The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
268
+
269
+ ## Cite
270
+
271
+ If you use this software, please cite one of
272
+
273
+ * [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
274
+ * [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
275
+
276
+ ## Biogems.info
277
+
278
+ This Biogem is published at [#bio-table](http://biogems.info/index.html)
279
+
280
+ ## Copyright
281
+
282
+ Copyright (c) 2012 Pjotr Prins. See LICENSE.txt for further details.
283
+
@@ -0,0 +1,55 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development) # activate the Gemfile's default and development groups
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message # show what Bundler could not resolve
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code # stop with the status code carried by the Bundler error
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "bio-table"
18
+ gem.homepage = "http://github.com/pjotrp/bioruby-table"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{Transforming/filtering tab/csv files}
21
+ gem.description = %Q{Functions and tools for tranforming and changing tab delimited and comma separated table files - useful for Excel sheets and SQL/RDF output}
22
+ gem.email = "pjotr.public01@thebird.nl"
23
+ gem.authors = ["Pjotr Prins"]
24
+ # dependencies defined in Gemfile
25
+ end
26
+ Jeweler::RubygemsDotOrgTasks.new
27
+
28
+ require 'rspec/core'
29
+ require 'rspec/core/rake_task'
30
+ RSpec::Core::RakeTask.new(:spec) do |spec|
31
+ spec.pattern = FileList['spec/**/*_spec.rb']
32
+ end
33
+
34
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
35
+ spec.pattern = 'spec/**/*_spec.rb'
36
+ spec.rcov = true
37
+ end
38
+
39
+ require 'cucumber/rake/task'
40
+ Cucumber::Rake::Task.new do |features| # empty block: the task is created without custom settings
41
+ end
42
+
43
+ task :test => [ :cucumber ] # `rake test` depends on :cucumber only; the :spec task is not part of it
44
+
45
+ task :default => :test # plain `rake` runs the :test task
46
+
47
+ require 'rdoc/task'
48
+ Rake::RDocTask.new do |rdoc|
49
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
50
+
51
+ rdoc.rdoc_dir = 'rdoc'
52
+ rdoc.title = "bio-table #{version}"
53
+ rdoc.rdoc_files.include('README*')
54
+ rdoc.rdoc_files.include('lib/**/*.rb')
55
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # BioRuby bio-table Plugin BioTable
4
+ # Author:: Pjotr Prins
5
+ # Copyright:: 2012
6
+
7
+ rootpath = File.dirname(File.dirname(__FILE__)) # parent of the directory holding this script
8
+ $: << File.join(rootpath,'lib') # make the bundled lib/ directory requirable
9
+
10
+ _VERSION = File.read(File.join(rootpath,'VERSION')).chomp # File.read closes the file; File.new(...).read left the handle open
11
+
12
+ $stderr.print "bio-table "+_VERSION+" Copyright (C) 2012 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"
13
+
14
+ USAGE =<<EOU
15
+
16
+ bio-table transforms, filters and reorders table files (CSV, tab-delimited).
17
+
18
+ EOU
19
+
20
+ # USAGE is not printed here: an empty command line switches on
21
+ # options[:show_help] further down, and the help branch prints the
22
+ # option summary followed by USAGE (printing it here too showed it twice).
23
+
24
+ require 'bio-table'
25
+ require 'optparse'
26
+ require 'bio-logger'
27
+
28
+ log = Bio::Log::LoggerPlus.new 'bio-table' # create the logger named 'bio-table'; the same name is looked up after option parsing
29
+ # log.outputters = Bio::Log::Outputter.stderr
30
+
31
+ Bio::Log::CLI.logger('stderr') # default log destination (--logger replaces it)
32
+ Bio::Log::CLI.trace('info') # default log level (--trace, -q, -v and --debug replace it)
33
+
34
+ options = {show_help: false, write_header: true} # defaults; the switches below fill in the rest
35
+ options[:show_help] = true if ARGV.size == 0
36
+ opts = OptionParser.new do |o|
37
+   o.banner = "Usage: #{File.basename($0)} [options] filename\n\n"
38
+   # The format switches take a required argument: with the optional "[tab,csv]" spelling a bare switch passed nil and nil.to_sym raised NoMethodError
39
+   o.on('--in-format tab|csv', [:tab, :csv], 'Input format (default tab)') do |par|
40
+     options[:in_format] = par.to_sym
41
+   end
42
+
43
+   o.on('--format tab|csv', [:tab, :csv], 'Output format (default tab)') do |par|
44
+     options[:format] = par.to_sym
45
+   end
46
+
47
+   o.on('--num-filter func', 'Numeric filtering function') do |par|
48
+     options[:num_filter] = par
49
+   end
50
+
51
+   o.on('--columns list', Array, 'List of column names or indices') do |l|
52
+     options[:columns] = l
53
+   end
54
+
55
+   o.on('--diff list',Array,'Diff two input files on columns') do |l|
56
+     options[:diff] = l
57
+   end
58
+
59
+   o.on('--overlap list',Array,'Find overlap of two input files on columns') do |l|
60
+     options[:overlap] = l
61
+   end
62
+
63
+   # --with-header is mentioned in the README but is still disabled here:
64
+   # o.on('--with-header','Include the header element in filtering etc.') do
65
+   #   options[:with_header] = true
66
+   # end
67
+
68
+   o.on('--with-rownames','Include the rownames in filtering etc.') do
69
+     options[:with_rownames] = true
70
+   end
71
+
72
+   o.separator ""
73
+
74
+   o.on("--logger filename",String,"Log to file (default stderr)") do | name |
75
+     Bio::Log::CLI.logger(name)
76
+   end
77
+
78
+   o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
79
+     Bio::Log::CLI.trace(s)
80
+   end
81
+
82
+   o.on("-q", "--quiet", "Run quietly") do
83
+     Bio::Log::CLI.trace('error')
84
+   end
85
+
86
+   o.on("-v", "--verbose", "Run verbosely") do
87
+     Bio::Log::CLI.trace('info')
88
+   end
89
+
90
+   o.on("--debug", "Show debug messages") do
91
+     Bio::Log::CLI.trace('debug')
92
+   end
93
+
94
+   o.separator ""
95
+
96
+   o.on_tail('-h', '--help', 'Display this help and exit') do
97
+     options[:show_help] = true
98
+   end
99
+ end
100
+
101
+ begin
102
+ opts.parse!(ARGV)
103
+
104
+ if options[:show_help]
105
+ print opts
106
+ print USAGE
107
+ end
108
+
109
+ # TODO: your code here
110
+ # use options for your logic
111
+ rescue OptionParser::InvalidOption => e
112
+ options[:invalid_argument] = e.message
113
+ end
114
+
115
+ Bio::Log::CLI.configure('bio-table') # NOTE(review): presumably applies the logger/trace choices made above to the 'bio-table' logger — confirm against bio-logger
116
+ logger = Bio::Log::LoggerPlus['bio-table'] # fetch the logger by name
117
+ logger.info [options, ARGV] # log the parsed options together with the remaining arguments
118
+
119
+ include BioTable # NOTE(review): presumably what makes TableReader and Diff below resolvable without a prefix — confirm
120
+
121
+ if options[:diff]
122
+ logger.warn "Column settings are ignored for --diff" if options[:columns]
123
+ logger.warn "Ignoring extraneaousfiles" if ARGV.size>2
124
+ t1 = TableReader::read_file(ARGV[0], options)
125
+ t2 = TableReader::read_file(ARGV[1], options)
126
+ t = Diff::diff_tables(t1,t2, options)
127
+ t.write(options)
128
+ exit
129
+ end
130
+
131
+ if options[:overlap] # --overlap only warns and exits here: no overlap is computed or written (the README marks it NYI)
132
+ logger.warn "Column settings are ignored for --overlap" if options[:columns]
133
+ exit
134
+ end
135
+
136
+
137
+ ARGV.each do | fn |
138
+ t = TableReader::read_file(fn, options)
139
+ t.write(options)
140
+ options[:write_header] = false # don't write the header for chained files
141
+ end