bio-table 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +12 -0
- data/Gemfile +18 -0
- data/LICENSE.txt +20 -0
- data/README.md +283 -0
- data/Rakefile +55 -0
- data/VERSION +1 -0
- data/bin/bio-table +141 -0
- data/features/bio-table-csv-reader-feature.rb +28 -0
- data/features/bio-table-csv-reader.feature +22 -0
- data/features/step_definitions/bio-table_steps.rb +0 -0
- data/features/support/env.rb +13 -0
- data/lib/bio-table.rb +23 -0
- data/lib/bio-table/columns.rb +11 -0
- data/lib/bio-table/diff.rb +30 -0
- data/lib/bio-table/filter.rb +57 -0
- data/lib/bio-table/formatter.rb +28 -0
- data/lib/bio-table/overlap.rb +1 -0
- data/lib/bio-table/parser.rb +22 -0
- data/lib/bio-table/table.rb +121 -0
- data/lib/bio-table/tablereader.rb +13 -0
- data/lib/bio-table/tablerow.rb +29 -0
- data/lib/bio-table/tablewriter.rb +6 -0
- data/lib/bio-table/validator.rb +26 -0
- data/spec/bio-table_spec.rb +7 -0
- data/spec/spec_helper.rb +12 -0
- data/test/data/input/table1.csv +381 -0
- metadata +168 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
# require 'bio-table'
|
2
|
+
|
3
|
+
# Step definitions for the "Read CSV table" feature.

# Keeps the doc string as a list of lines for the later steps.
Given(/^a comma separated table$/) do |string|
  @lines = string.split(/\n/)
end

# Intentionally empty: the table is parsed in the Then step below.
When(/^I read the multi\-line string$/) do
end

# Parses the stored lines as CSV and checks the first three header cells.
Then(/^I should correctly parse the comma\-separated headers into "(.*?)","(.*?)","(.*?)"$/) do |arg1, arg2, arg3|
  @t = BioTable::Table.new
  @t.read_lines(@lines, :in_format => :csv)
  @t.header[0].should == arg1
  @t.header[1].should == arg2
  @t.header[2].should == arg3
end

# Compares the data fields of the first row with the Ruby array literal
# given in the doc string.
And(/^I should correctly parse the first line into$/) do |string|
  line1 = @t[0]
  # NOTE(review): eval executes the doc string as Ruby code. Acceptable only
  # while feature files are trusted, project-owned input.
  s = eval(string)
  line1.fields.should == s
end

# Checks the first row name against the value captured from the step text
# (previously a hard-coded "105853" that ignored the captured argument).
And(/^it should have the rowname "(.*?)"$/) do |arg1|
  @t.rownames[0].should == arg1
end
|
27
|
+
|
28
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
Feature: Read CSV table
|
2
|
+
|
3
|
+
bio-table should read multiple table formats.
|
4
|
+
|
5
|
+
Scenario: Read a comma separated table
|
6
|
+
Given a comma separated table
|
7
|
+
"""
|
8
|
+
#Gene,AJ,B6,Axb1,Axb2,Axb4,Axb12,AXB13,Axb15,Axb19,Axb23,Axb24,Bxa1,Bxa2,Bxa4,Bxa7,Bxa8,Bxa12,Bxa11,Bxa13,Bxa15,Bxa16,Axb5,Axb6,Axb8,Axb1,Bxa24,Bxa25,Bxa26,gene_symbol,gene_desc
|
9
|
+
105853,0.06,0,0,0,0,0,0.11,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,Mal2,"MAL2 proteolipid protein"
|
10
|
+
105855,236.88,213.95,213.15,253.49,198,231.56,200.96,255.2,214.04,231.46,,,233.23,241.26,237.53,171.87,237.13,162.3,252.13,284.85,188.76,253.43,220.15,305.52,,217.42,,,Nckap1l,"NCK associated protein 1 like,NCK associated protein 1 like,"
|
11
|
+
105859,0,0.14,0,0,0.07,0.04,0,0,0,0,,,0.02,0,0,0,0,0,0.06,0,0,0,0.02,0,,0,,,Csdc2,"RNA-binding protein pippin"
|
12
|
+
105866,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,Krt72,"keratin 72"
|
13
|
+
"""
|
14
|
+
When I read the multi-line string
|
15
|
+
Then I should correctly parse the comma-separated headers into "#Gene","AJ","B6"
|
16
|
+
And I should correctly parse the first line into
|
17
|
+
"""
|
18
|
+
["0.06", "0", "0", "0", "0", "0", "0.11", "0", "0", "0", nil, nil, "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", nil, "0", nil, nil, "Mal2", "MAL2 proteolipid protein"]
|
19
|
+
"""
|
20
|
+
And it should have the rowname "105853"
|
21
|
+
|
22
|
+
|
File without changes
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'bundler'

# Activate the :default and :development gem groups. On a Bundler error,
# report it on stderr and exit with the status code Bundler supplies.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  [
    bundler_error.message,
    "Run `bundle install` to install missing gems"
  ].each { |text| $stderr.puts text }
  exit bundler_error.status_code
end

# Put the gem's lib directory at the front of the load path, then load it.
lib_dir = File.dirname(__FILE__) + '/../../lib'
$LOAD_PATH.unshift(lib_dir)
require 'bio-table'

require 'rspec/expectations'
|
data/lib/bio-table.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-logger'
|
12
|
+
require 'bio-table/columns.rb'
|
13
|
+
require 'bio-table/validator.rb'
|
14
|
+
require 'bio-table/filter.rb'
|
15
|
+
require 'bio-table/parser.rb'
|
16
|
+
require 'bio-table/formatter.rb'
|
17
|
+
require 'bio-table/tablerow.rb'
|
18
|
+
require 'bio-table/table.rb'
|
19
|
+
require 'bio-table/tablereader.rb'
|
20
|
+
require 'bio-table/tablewriter.rb'
|
21
|
+
require 'bio-table/diff.rb'
|
22
|
+
require 'bio-table/overlap.rb'
|
23
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Diff module
|
2
|
+
#
|
3
|
+
|
4
|
+
module BioTable

  module Diff

    # Build a new table (with t1's header) holding the rows of t2 whose key,
    # taken from the columns named by options[:diff], does not occur in t1.
    #
    # A warning is logged when the selected key columns are not unique in
    # either table.
    def self.diff_tables(t1, t2, options)
      log = Bio::Log::LoggerPlus['bio-table']
      columns = Columns::to_list(options[:diff])
      result = Table.new(t1.header)

      # Extracts the key (list of selected column values) of one row.
      key_of = lambda { |row| columns.map { |i| row.all_fields[i] } }

      keys1 = t1.map(&key_of)
      keys2 = t2.map(&key_of)
      duplicates = (keys1.uniq.size != keys1.size) || (keys2.uniq.size != keys2.size)
      log.warn "Not all selected keys are unique!" if duplicates

      # Index table 2 by key; a later row with the same key replaces an
      # earlier one.
      by_key = {}
      t2.each { |row| by_key[key_of.call(row)] = row }

      (keys2 - keys1).each do |values|
        result.push(t2.row_by_columns(columns.zip(values), by_key))
      end
      result
    end

  end

end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module BioTable

  # Helpers for selecting columns and for filtering rows on numeric values.
  module Filter

    # Create an index to the column headers, so header A,B,C,D with columns
    # C,A returns [2,0]. When every entry of +columns+ already is an index
    # (an Integer, or a String holding an integer) the entries are returned
    # converted with to_i and the header is not consulted.
    #
    # Returns nil when +columns+ is nil or false. Raises a RuntimeError when
    # a column name does not occur in +header+.
    def Filter::create_column_index columns, header
      return nil if not columns
      return columns.map { |v| v.to_i } if columns.all? { |v| valid_int?(v) }

      columns.map do | name |
        pos = header.index(name)
        raise "Column name #{name} not found!" if pos == nil
        pos
      end
    end

    # Pick the values at the positions in +index+ from +fields+, in index
    # order. Without an index the fields are returned untouched.
    def Filter::apply_column_filter fields, index
      return fields if not index
      index.map { |idx| fields[idx] }
    end

    # True for an Integer, or for a String that round-trips through to_i
    # unchanged (so "12" is valid, "012" and "1.0" are not).
    def Filter::valid_int?(s)
      # Integers never equal their String form, so accept them up front.
      return true if s.kind_of?(Integer)
      s.to_i.to_s == s
    end

    # True when the string form of +s+ is an optionally signed decimal
    # number such as "3", "-0.5" or "+12.25".
    def Filter::valid_number?(s)
      !s.to_s.match(/\A[+-]?\d+?(\.\d+)?\Z/).nil?
    end

    # Evaluate the Ruby expression +code+ against one row and return its
    # result. The expression can refer to the local +values+ (the fields
    # converted to Float, nil for non-numeric fields) and to +fields+.
    #
    # Returns true when no +code+ is given and false when +fields+ is nil.
    # Errors raised by the expression are reported on stderr and re-raised.
    def Filter::numeric code, fields
      return true if code == nil
      return false if not fields

      values = fields.map { |field| (valid_number?(field) ? field.to_f : nil ) } # FIXME: not so lazy
      begin
        # NOTE(review): eval executes +code+ as arbitrary Ruby. Presumably
        # it comes from the command line; confirm it is never taken from
        # untrusted input.
        eval(code)
      rescue
        $stderr.print "Failed to evaluate ",fields," with ",code,"\n"
        raise
      end
    end
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module BioTable

  # Prints a row as one tab separated line; nil fields are written as "NA".
  class TabFormatter
    def write(list)
      cells = list.map { |field| field.nil? ? "NA" : field }
      print cells.join("\t"), "\n"
    end
  end

  # Prints a row as one CSV record.
  class CsvFormatter
    def write(list)
      print CSV.generate { |csv| csv << list }
    end
  end

  # Chooses the formatter for an output format: CSV for :csv, tab separated
  # for everything else.
  module FormatFactory
    def self.create(format)
      # @logger.info("Formatting to #{format}")
      format == :csv ? CsvFormatter.new : TabFormatter.new
    end
  end
end
|
28
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
# Overlap module
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module BioTable

  module LineParser

    # Split one input line into its fields. With in_format :csv the line is
    # handed to the CSV library and its first record is returned. Any other
    # format is treated as tab separated: the line and every field are
    # stripped of surrounding whitespace and a field reading "NA" becomes
    # nil.
    def LineParser::parse(line, in_format)
      return CSV.parse(line)[0] if in_format == :csv

      line.strip.split("\t").map do |field|
        cell = field.strip
        cell == "NA" ? nil : cell
      end
    end

  end

end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
module BioTable

  # In-memory table made of a header, a list of row names and a parallel
  # list of row field arrays. The table is Enumerable; iterating yields one
  # TableRow per stored row.
  class Table

    include Enumerable

    attr_reader :header, :rows, :rownames

    # header:: optional list of column names. When omitted the header is
    #          taken from the first line handed to read_lines.
    def initialize header=nil
      @header = header if header
      @logger = Bio::Log::LoggerPlus['bio-table']
      @rows = []
      @rownames = []
    end

    # Read lines (list of string) and add them to the table, setting row
    # names and row fields. The first line is always parsed as the header:
    # it is validated against a header that was set earlier and, after
    # column selection, stored as the table header.
    #
    # Options:
    # :in_format::      input format passed to LineParser::parse
    # :columns::        column names or indexes passed to
    #                   Filter::create_column_index
    # :num_filter::     expression passed to Filter::numeric; rows for which
    #                   it is false or nil are skipped
    # :with_rownamess:: when set, the first column stays part of the data
    #                   fields and no row name is recorded
    def read_lines lines, options = {}
      num_filter = options[:num_filter]
      @logger.debug "Filtering on #{num_filter}" if num_filter
      use_columns = options[:columns]
      @logger.debug "Filtering on columns #{use_columns}" if use_columns
      # NOTE(review): the key is spelled :with_rownamess (double s); confirm
      # that callers use the same spelling.
      include_rownamess = options[:with_rownamess]
      @logger.debug "Include row names" if include_rownamess
      first_column = (include_rownamess ? 0 : 1)

      # parse the header
      header = LineParser::parse(lines[0], options[:in_format])
      Validator::valid_header?(header, @header)
      @header = header if not @header

      column_index = Filter::create_column_index(use_columns,header)
      @header = Filter::apply_column_filter(header,column_index)

      (lines[1..-1]).each do | line |
        fields = LineParser::parse(line, options[:in_format])
        fields = Filter::apply_column_filter(fields,column_index)
        rowname = fields[0]
        data_fields = fields[first_column..-1]
        next if not Validator::valid_row?(data_fields,@header,@rows)
        next if not Filter::numeric(num_filter,data_fields)
        @rownames << rowname if not include_rownamess # otherwise doubles rownamess
        @rows << data_fields
      end
    end

    # Read a whole file and pass its lines to read_lines. When no
    # :in_format is given and the file name ends in .csv the format is set
    # to :csv for this call only; the caller's options hash is left
    # untouched so a later non-CSV file is not parsed as CSV.
    def read_file filename, options = {}
      if not options[:in_format] and filename =~ /\.csv$/
        @logger.debug "Autodetected CSV file"
        options = options.merge(:in_format => :csv)
      end
      @logger.debug(options)
      # Read the file lines into an Array, not lazy FIXME
      # File.readlines closes the file once it has been read.
      lines = File.readlines(filename)
      read_lines(lines, options)
    end

    # Write the table through the formatter selected by options[:format]
    # (:tab when not given). The header is written only when
    # options[:write_header] is set; rows that are not valid? are skipped.
    def write options = {}
      format = options[:format]
      format = :tab if not format
      formatter = FormatFactory::create(format)
      formatter.write(@header) if options[:write_header]
      each do | tablerow |
        # p tablerow
        formatter.write(tablerow.all_fields) if tablerow.valid?
      end
    end

    # Append a row. Accepts either a TableRow, or a row name followed by the
    # list of fields.
    def push rownames,fields = nil
      if fields == nil and rownames.kind_of?(TableRow)
        @rownames << rownames.rowname
        @rows << rownames.fields
      else
        @rownames << rownames
        @rows << fields
      end
    end

    # Return the row at position +row+ as a TableRow, or nil when +row+ is
    # nil or false.
    def [] row
      if row
        TableRow.new(@rownames[row],@rows[row])
      else
        nil
      end
    end

    # Return the first row carrying the row name +name+, or nil when the
    # name is unknown.
    def row_by_name name
      self[rownames.index(name)]
    end

    # Find a row by column values. +zip+ is a list of [column index, value]
    # pairs matched against TableRow#all_fields. With +idx+ (a Hash from the
    # list of values to a row) the row is looked up directly; without it the
    # table is scanned. Returns nil when no row matches.
    def row_by_columns zip,idx=nil
      if idx
        row = idx[zip.transpose[1]]
        # The index may not know the key at all; treat that as "no match".
        return row if row && row.match_all_fields?(zip)
      else
        index = zip.first[0]
        value = zip.first[1]
        each do | row |
          fields = row.all_fields
          if fields[index] == value
            return row if row.match_all_fields?(zip)
          end
        end
      end
      nil
    end

    # Yield a TableRow for every stored row. Returns an Enumerator when
    # called without a block.
    def each
      return enum_for(:each) unless block_given?
      @rows.each_with_index do | row,i |
        yield TableRow.new(@rownames[i], row)
      end
    end

  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module BioTable

  # One table row: a row name plus the list of data fields.
  class TableRow
    attr_reader :rowname, :fields

    def initialize(rowname, fields)
      @rowname, @fields = rowname, fields
    end

    # The row name followed by the data fields, as one flat list.
    def all_fields
      [@rowname, @fields].flatten
    end

    # A row is valid when it has at least one data field.
    def valid?
      return false if fields.nil?
      fields.size > 0
    end

    # True when, for every [index, value] pair in +zip+, the entry of
    # all_fields at that index equals the value.
    def match_all_fields?(zip)
      cells = all_fields
      zip.each { |pair| return false unless cells[pair[0]] == pair[1] }
      true
    end
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module BioTable

  # Consistency checks applied while a table is being read.
  module Validator

    # Check a newly parsed +header+ against +old_header+. Raises a
    # RuntimeError when the new header holds a name the old one lacks;
    # returns true otherwise, including when there is no old header.
    def Validator::valid_header? header, old_header
      if old_header and header - old_header != []
        # Diagnostics go to stderr; stdout carries the table output.
        $stderr.puts old_header.inspect
        $stderr.puts header.inspect
        raise "Headers do not compare!"
      end
      true
    end

    # Check the data +fields+ of a row against the rows read so far.
    # Returns false for a nil or empty row and true for a usable one.
    # Raises ArgumentError when the number of fields differs from the
    # previous row. +header+ is accepted but not inspected.
    def Validator::valid_row? fields, header, rows
      return false if fields == nil or fields.size == 0
      if rows.size>0 and (fields.size != rows.last.size)
        # Diagnostics go to stderr; stdout carries the table output.
        $stderr.puts rows.last.inspect
        $stderr.puts fields.inspect
        # raise, not throw: throw without a matching catch only surfaces as
        # an "uncaught throw" error that hides this message.
        raise ArgumentError, "Number of fields diverge in line #{rows.size + 1} (size #{fields.size}, expected #{rows.last.size})"
      end
      true
    end
  end

end
|