bio-table 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +12 -0
- data/Gemfile +18 -0
- data/LICENSE.txt +20 -0
- data/README.md +283 -0
- data/Rakefile +55 -0
- data/VERSION +1 -0
- data/bin/bio-table +141 -0
- data/features/bio-table-csv-reader-feature.rb +28 -0
- data/features/bio-table-csv-reader.feature +22 -0
- data/features/step_definitions/bio-table_steps.rb +0 -0
- data/features/support/env.rb +13 -0
- data/lib/bio-table.rb +23 -0
- data/lib/bio-table/columns.rb +11 -0
- data/lib/bio-table/diff.rb +30 -0
- data/lib/bio-table/filter.rb +57 -0
- data/lib/bio-table/formatter.rb +28 -0
- data/lib/bio-table/overlap.rb +1 -0
- data/lib/bio-table/parser.rb +22 -0
- data/lib/bio-table/table.rb +121 -0
- data/lib/bio-table/tablereader.rb +13 -0
- data/lib/bio-table/tablerow.rb +29 -0
- data/lib/bio-table/tablewriter.rb +6 -0
- data/lib/bio-table/validator.rb +26 -0
- data/spec/bio-table_spec.rb +7 -0
- data/spec/spec_helper.rb +12 -0
- data/test/data/input/table1.csv +381 -0
- metadata +168 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
# Step definitions for the "Read CSV table" feature.
# require 'bio-table'

# Splits the scenario's doc string into the raw lines that are later handed
# to the table reader.
Given /^a comma separated table$/ do |string|
  @lines = string.split(/\n/)
end

# Intentionally empty: the table is built and read in the first "Then" step.
When /^I read the multi\-line string$/ do
end

# Reads the stored lines as CSV and checks the first three header names.
Then /^I should correctly parse the comma\-separated headers into "(.*?)","(.*?)","(.*?)"$/ do |arg1, arg2, arg3|
  @t = BioTable::Table.new
  @t.read_lines(@lines, :in_format => :csv)
  @t.header[0].should == arg1
  @t.header[1].should == arg2
  @t.header[2].should == arg3
end

# Compares the data fields of the first row with the expected Ruby array
# literal given in the doc string.
And /^I should correctly parse the first line into$/ do |string|
  line1 = @t[0]
  # NOTE(review): eval is tolerable here only because the text comes from our
  # own feature file; never feed it anything that is not a trusted fixture.
  s = eval(string)
  line1.fields.should == s
end

# Checks the first row name against the value captured from the step text
# (it used to be compared with a hard-coded "105853", ignoring the argument).
And /^it should have the rowname "(.*?)"$/ do |arg1|
  @t.rownames[0].should == arg1
end
|
27
|
+
|
28
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
Feature: Read CSV table
|
2
|
+
|
3
|
+
bio-table should read multiple table formats.
|
4
|
+
|
5
|
+
Scenario: Read a comma separated table
|
6
|
+
Given a comma separated table
|
7
|
+
"""
|
8
|
+
#Gene,AJ,B6,Axb1,Axb2,Axb4,Axb12,AXB13,Axb15,Axb19,Axb23,Axb24,Bxa1,Bxa2,Bxa4,Bxa7,Bxa8,Bxa12,Bxa11,Bxa13,Bxa15,Bxa16,Axb5,Axb6,Axb8,Axb1,Bxa24,Bxa25,Bxa26,gene_symbol,gene_desc
|
9
|
+
105853,0.06,0,0,0,0,0,0.11,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,Mal2,"MAL2 proteolipid protein"
|
10
|
+
105855,236.88,213.95,213.15,253.49,198,231.56,200.96,255.2,214.04,231.46,,,233.23,241.26,237.53,171.87,237.13,162.3,252.13,284.85,188.76,253.43,220.15,305.52,,217.42,,,Nckap1l,"NCK associated protein 1 like,NCK associated protein 1 like,"
|
11
|
+
105859,0,0.14,0,0,0.07,0.04,0,0,0,0,,,0.02,0,0,0,0,0,0.06,0,0,0,0.02,0,,0,,,Csdc2,"RNA-binding protein pippin"
|
12
|
+
105866,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,Krt72,"keratin 72"
|
13
|
+
"""
|
14
|
+
When I read the multi-line string
|
15
|
+
Then I should correctly parse the comma-separated headers into "#Gene","AJ","B6"
|
16
|
+
And I should correctly parse the first line into
|
17
|
+
"""
|
18
|
+
["0.06", "0", "0", "0", "0", "0", "0.11", "0", "0", "0", nil, nil, "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", nil, "0", nil, nil, "Mal2", "MAL2 proteolipid protein"]
|
19
|
+
"""
|
20
|
+
And it should have the rowname "105853"
|
21
|
+
|
22
|
+
|
File without changes
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Cucumber support environment: activates the bundle, puts the gem's lib
# directory on the load path and loads the library plus RSpec expectations.
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Report the missing-gem problem and exit with Bundler's own status code.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end

# Use a normalised absolute path so the entry stays valid even when __FILE__
# is relative and the working directory changes later on.
$LOAD_PATH.unshift(File.expand_path('../../lib', File.dirname(__FILE__)))
require 'bio-table'

require 'rspec/expectations'
|
data/lib/bio-table.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-logger'
|
12
|
+
require 'bio-table/columns.rb'
|
13
|
+
require 'bio-table/validator.rb'
|
14
|
+
require 'bio-table/filter.rb'
|
15
|
+
require 'bio-table/parser.rb'
|
16
|
+
require 'bio-table/formatter.rb'
|
17
|
+
require 'bio-table/tablerow.rb'
|
18
|
+
require 'bio-table/table.rb'
|
19
|
+
require 'bio-table/tablereader.rb'
|
20
|
+
require 'bio-table/tablewriter.rb'
|
21
|
+
require 'bio-table/diff.rb'
|
22
|
+
require 'bio-table/overlap.rb'
|
23
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Diff module
|
2
|
+
#
|
3
|
+
|
4
|
+
module BioTable

  module Diff

    # Returns a new Table (with the header of +t1+) holding every row of +t2+
    # whose key is not present in +t1+.
    #
    # The key of a row is the list of its +all_fields+ values at the positions
    # returned by Columns::to_list(options[:diff]) (presumably integer column
    # positions; Columns is defined elsewhere, confirm there).
    #
    # t1, t2  :: tables, i.e. enumerables yielding rows that answer +all_fields+
    # options :: Hash; only options[:diff] is read
    #
    # A warning is logged when the selected keys are not unique within either
    # table. Rows of +t2+ are emitted once each, in their original order, so
    # duplicate keys no longer make one row appear several times while the
    # others are dropped.
    def self.diff_tables(t1, t2, options)
      logger = Bio::Log::LoggerPlus['bio-table']
      columns = Columns::to_list(options[:diff])

      # Build the key once per row; all_fields is computed a single time.
      key_of = lambda do |row|
        row_fields = row.all_fields
        columns.map { |i| row_fields[i] }
      end

      keys1 = t1.map { |row| key_of.call(row) }
      keyed_rows2 = t2.map { |row| [key_of.call(row), row] }
      keys2 = keyed_rows2.map { |key, _row| key }

      if keys1.uniq.size != keys1.size || keys2.uniq.size != keys2.size
        logger.warn "Not all selected keys are unique!"
      end

      # Hash lookup for the keys already present in table 1.
      known = {}
      keys1.each { |key| known[key] = true }

      result = Table.new(t1.header)
      keyed_rows2.each do |key, row|
        result.push(row) unless known.key?(key)
      end
      result
    end

  end

end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module BioTable

  # Column selection and numeric row filtering helpers.
  module Filter

    # Translates a column selection into positions within +header+, so header
    # A,B,C,D with columns C,A returns [2,0]. When every entry of +columns+
    # already is an integer (Integer or integer-looking String) the entries
    # are returned as integers unchanged.
    #
    # Returns nil when +columns+ is nil/false.
    # Raises RuntimeError when a column name is not found in +header+.
    def self.create_column_index(columns, header)
      return nil unless columns
      return columns.map { |v| v.to_i } if columns.all? { |v| valid_int?(v) }

      columns.map do |name|
        pos = header.index(name)
        raise "Column name #{name} not found!" if pos.nil?
        pos
      end
    end

    # Picks the fields at the positions listed in +index+; returns +fields+
    # untouched when no index is given.
    def self.apply_column_filter(fields, index)
      return fields unless index
      index.map { |idx| fields[idx] }
    end

    # True when +s+ is the canonical decimal form of an integer ("12", "-3").
    # Integer objects are accepted as well so callers may pass positions
    # directly; "01", "+1" and "1.0" are not.
    def self.valid_int?(s)
      s.to_i.to_s == s.to_s
    end

    # True when +s+ looks like a decimal number: optional sign, digits,
    # optional fraction and optional exponent ("3", "-0.5", "1e-5", "2.5E+3").
    # \Z (not \z) is kept on purpose so a value with a trailing newline is
    # still accepted, as before.
    def self.valid_number?(s)
      !(s.to_s =~ /\A[+-]?\d+(\.\d+)?([eE][+-]?\d+)?\Z/).nil?
    end

    # Evaluates the filter expression +code+ for one row and returns its
    # result.
    #
    # Returns true when no +code+ is given and false when +fields+ is
    # nil/false. The expression sees the locals +values+ (fields converted to
    # Float, nil where a field is not numeric), +fields+ and +code+; these
    # names are part of the filter contract and must not be renamed.
    #
    # Errors raised by the expression are reported on stderr and re-raised.
    #
    # NOTE(review): +code+ is run through eval. That is only safe while the
    # expression comes from the person running the tool; never pass untrusted
    # input here.
    def self.numeric(code, fields)
      return true if code.nil?
      return false unless fields

      values = fields.map { |field| valid_number?(field) ? field.to_f : nil } # FIXME: not so lazy
      begin
        eval(code)
      rescue
        $stderr.print "Failed to evaluate ",fields," with ",code,"\n"
        raise
      end
    end
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# CsvFormatter needs the CSV library; require it here instead of relying on
# another file having loaded it first.
require 'csv'

module BioTable

  # Writes one row per call to standard output as tab-separated text.
  class TabFormatter
    # list :: Array of field values; nil fields are written as "NA".
    def write list
      print list.map { |field| field.nil? ? "NA" : field }.join("\t"), "\n"
    end
  end

  # Writes one row per call to standard output as a CSV record.
  class CsvFormatter
    # list :: Array of field values; quoting is left to the CSV library.
    def write list
      csv_string = CSV.generate do |csv|
        csv << list
      end
      print csv_string
    end
  end

  # Chooses the formatter for an output format.
  module FormatFactory
    # Returns a CsvFormatter for :csv and a TabFormatter for anything else.
    def self.create format
      # @logger.info("Formatting to #{format}")
      return CsvFormatter.new if format == :csv
      TabFormatter.new
    end
  end
end
|
28
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
# Overlap module
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module BioTable

  module LineParser

    # Converts one line of text into an array of string fields.
    #
    # line      :: the raw line, with or without its line terminator
    # in_format :: :csv parses the line with the CSV library; any other value
    #              treats the line as tab-separated
    #
    # Tab-separated fields are stripped of surrounding whitespace and "NA"
    # becomes nil. Empty cells are kept, including leading and trailing ones,
    # so columns never shift. A blank line yields [] in both formats.
    def LineParser::parse(line, in_format)
      if in_format == :csv
        # CSV.parse gives no rows for an empty string; hand back [] then.
        CSV.parse(line)[0] || []
      else
        return [] if line.strip.empty?
        # chomp (not strip) keeps leading/trailing tabs; the -1 limit keeps
        # trailing empty cells.
        line.chomp.split("\t", -1).map do |field|
          fld = field.strip
          fld == "NA" ? nil : fld
        end
      end
    end

  end

end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
module BioTable

  # In-memory table: a header, a list of row names and a parallel list of
  # rows (arrays of fields). Iterating yields TableRow objects.
  class Table

    include Enumerable

    attr_reader :header, :rows, :rownames

    # header :: optional Array of column names to start with
    def initialize header=nil
      @header = header
      @logger = Bio::Log::LoggerPlus['bio-table']
      @rows = []
      @rownames = []
    end

    # Read lines (list of string) and add them to the table, setting row names
    # and row fields. The first line is parsed as the header.
    #
    # Recognised options:
    #   :in_format     - passed to LineParser::parse for every line
    #   :columns       - column selection, resolved with Filter::create_column_index
    #   :num_filter    - expression passed to Filter::numeric for every row
    #   :with_rownames - keep the first column among the data fields and do
    #                    not record separate row names (the historical
    #                    misspelling :with_rownamess is accepted too)
    #
    # Rows rejected by Validator::valid_row? or Filter::numeric are skipped.
    # An empty +lines+ list leaves the table untouched.
    def read_lines lines, options = {}
      num_filter = options[:num_filter]
      @logger.debug "Filtering on #{num_filter}" if num_filter
      use_columns = options[:columns]
      @logger.debug "Filtering on columns #{use_columns}" if use_columns
      include_rownames = options[:with_rownames] || options[:with_rownamess]
      @logger.debug "Include row names" if include_rownames
      first_column = (include_rownames ? 0 : 1)

      # Nothing to parse, not even a header.
      return [] if lines.empty?

      # parse the header
      header = LineParser::parse(lines[0], options[:in_format])
      Validator::valid_header?(header, @header)
      # Recorded first so the header is set even if column selection raises.
      @header = header if not @header

      column_index = Filter::create_column_index(use_columns,header)
      @header = Filter::apply_column_filter(header,column_index)

      (lines[1..-1]).each do | line |
        fields = LineParser::parse(line, options[:in_format])
        fields = Filter::apply_column_filter(fields,column_index)
        rowname = fields[0]
        data_fields = fields[first_column..-1]
        next if not Validator::valid_row?(data_fields,@header,@rows)
        next if not Filter::numeric(num_filter,data_fields)
        @rownames << rowname if not include_rownames # otherwise doubles the row names
        @rows << data_fields
      end
    end

    # Reads +filename+ and feeds its lines to read_lines. When no :in_format
    # is given and the name ends in .csv, CSV is selected automatically.
    # The caller's +options+ hash is not modified, so it can be reused for
    # files of a different format.
    def read_file filename, options = {}
      opts = options.dup
      if not opts[:in_format] and filename =~ /\.csv$/
        @logger.debug "Autodetected CSV file"
        opts[:in_format] = :csv
      end
      @logger.debug(opts)
      # Read the file lines into an Array, not lazy FIXME
      # File.readlines closes the file handle once the lines are read.
      lines = File.readlines(filename)
      read_lines(lines, opts)
    end

    # Writes the table through the formatter chosen by options[:format]
    # (default :tab). The header is written only when options[:write_header]
    # is set; rows for which TableRow#valid? is false are skipped.
    def write options = {}
      format = options[:format] || :tab
      formatter = FormatFactory::create(format)
      formatter.write(@header) if options[:write_header]
      each do | tablerow |
        # p tablerow
        formatter.write(tablerow.all_fields) if tablerow.valid?
      end
    end

    # Appends a row. Accepts either a TableRow alone or a row name followed
    # by the list of fields.
    def push rownames,fields = nil
      if fields == nil and rownames.kind_of?(TableRow)
        @rownames << rownames.rowname
        @rows << rownames.fields
      else
        @rownames << rownames
        @rows << fields
      end
    end

    # Returns the row at position +row+ as a TableRow, or nil when +row+ is
    # nil/false.
    def [] row
      return nil unless row
      TableRow.new(@rownames[row],@rows[row])
    end

    # Returns the first row with the given name, or nil when there is none.
    def row_by_name name
      self[rownames.index(name)]
    end

    # Finds a row by column values.
    #
    # zip :: list of [index, value] pairs, indices referring to all_fields
    # idx :: optional Hash mapping a list of values to a row; when given, the
    #        row is looked up there instead of scanning the table
    #
    # Returns the matching row, or nil when nothing matches (including an
    # index lookup that finds no entry, or an empty +zip+).
    def row_by_columns zip,idx=nil
      return nil if zip.empty?
      if idx
        row = idx[zip.transpose[1]]
        return row if row && row.match_all_fields?(zip)
        nil
      else
        find { |row| row.match_all_fields?(zip) }
      end
    end

    # Yields every row as a TableRow; returns an Enumerator when called
    # without a block.
    def each
      return enum_for(:each) unless block_given?
      @rows.each_with_index do | row,i |
        yield TableRow.new(@rownames[i], row)
      end
    end

  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module BioTable

  # One table row: a row name plus the list of remaining fields.
  class TableRow
    attr_reader :rowname, :fields

    # rowname :: value of the name column (may be nil)
    # fields  :: Array of the remaining field values (may be nil)
    def initialize rowname, fields
      @rowname = rowname
      @fields = fields
    end

    # Returns a new Array with the row name followed by the fields. Field
    # values are copied as they are (no recursive flattening); a row without
    # fields yields just the row name.
    def all_fields
      [@rowname, *@fields]
    end

    # True when the row has at least one field.
    def valid?
      !fields.nil? && fields.size > 0
    end

    # True when, for every [index, value] pair in +zip+, the value at that
    # position of all_fields equals +value+. An empty +zip+ matches.
    def match_all_fields? zip
      row_fields = all_fields
      zip.all? { |index, value| row_fields[index] == value }
    end
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module BioTable

  # Consistency checks used while reading a table.
  module Validator

    # Checks a newly parsed header against the header already known.
    #
    # Returns true when there is no old header, or when every name in
    # +header+ also occurs in +old_header+. Otherwise both headers are
    # printed to stderr (stdout carries the table output) and a RuntimeError
    # is raised.
    def Validator::valid_header? header, old_header
      if old_header && !(header - old_header).empty?
        $stderr.puts old_header.inspect
        $stderr.puts header.inspect
        raise "Headers do not compare!"
      end
      true
    end

    # Checks one row before it is added to +rows+.
    #
    # Returns false for a nil or empty row and true for an acceptable one.
    # Raises RuntimeError when the number of fields differs from the last
    # stored row; both rows are printed to stderr first. +header+ is accepted
    # for interface compatibility but not consulted.
    def Validator::valid_row? fields, header, rows
      return false if fields.nil? || fields.empty?
      unless rows.empty? || fields.size == rows.last.size
        $stderr.puts rows.last.inspect
        $stderr.puts fields.inspect
        # raise, not throw: this is an error, not a catch/throw jump.
        raise "Number of fields diverge in line #{rows.size + 1} (size #{fields.size}, expected #{rows.last.size})"
      end
      true
    end
  end

end
|