bio-table 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ # require 'bio-table'
2
+
3
# Step definitions for the "Read CSV table" feature.

# Keeps the doc string as a list of lines for the later parsing step.
Given /^a comma separated table$/ do |string|
  @lines = string.split(/\n/)
end

# Intentionally empty: the table is parsed in the header step below.
When /^I read the multi\-line string$/ do
end

# Parses the stored lines as CSV and compares the first three header names
# with the values captured from the step text.
Then /^I should correctly parse the comma\-separated headers into "(.*?)","(.*?)","(.*?)"$/ do |arg1, arg2, arg3|
  @t = BioTable::Table.new
  @t.read_lines(@lines, :in_format => :csv)
  @t.header[0].should == arg1
  @t.header[1].should == arg2
  @t.header[2].should == arg3
end

# Compares the data fields of the first row with the expected array.
And /^I should correctly parse the first line into$/ do |string|
  # NOTE: the doc string is evaluated as Ruby source (an array literal that
  # contains nil). It comes from the feature file, never from external input;
  # do not reuse this pattern on untrusted text.
  expected_fields = eval(string)
  @t[0].fields.should == expected_fields
end

# Checks the name of the first row against the value given in the step text
# (the original compared against a hard-coded "105853" and ignored the capture).
And /^it should have the rowname "(.*?)"$/ do |arg1|
  @t.rownames[0].should == arg1
end
27
+
28
+
@@ -0,0 +1,22 @@
1
+ Feature: Read CSV table
2
+
3
+ bio-table should read multiple table formats.
4
+
5
+ Scenario: Read a comma separated table
6
+ Given a comma separated table
7
+ """
8
+ #Gene,AJ,B6,Axb1,Axb2,Axb4,Axb12,AXB13,Axb15,Axb19,Axb23,Axb24,Bxa1,Bxa2,Bxa4,Bxa7,Bxa8,Bxa12,Bxa11,Bxa13,Bxa15,Bxa16,Axb5,Axb6,Axb8,Axb1,Bxa24,Bxa25,Bxa26,gene_symbol,gene_desc
9
+ 105853,0.06,0,0,0,0,0,0.11,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,Mal2,"MAL2 proteolipid protein"
10
+ 105855,236.88,213.95,213.15,253.49,198,231.56,200.96,255.2,214.04,231.46,,,233.23,241.26,237.53,171.87,237.13,162.3,252.13,284.85,188.76,253.43,220.15,305.52,,217.42,,,Nckap1l,"NCK associated protein 1 like,NCK associated protein 1 like,"
11
+ 105859,0,0.14,0,0,0.07,0.04,0,0,0,0,,,0.02,0,0,0,0,0,0.06,0,0,0,0.02,0,,0,,,Csdc2,"RNA-binding protein pippin"
12
+ 105866,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,Krt72,"keratin 72"
13
+ """
14
+ When I read the multi-line string
15
+ Then I should correctly parse the comma-separated headers into "#Gene","AJ","B6"
16
+ And I should correctly parse the first line into
17
+ """
18
+ ["0.06", "0", "0", "0", "0", "0", "0.11", "0", "0", "0", nil, nil, "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", nil, "0", nil, nil, "Mal2", "MAL2 proteolipid protein"]
19
+ """
20
+ And it should have the rowname "105853"
21
+
22
+
@@ -0,0 +1,13 @@
1
require 'bundler'

# Activate the default and development gem groups; when Bundler cannot do so,
# report the problem on stderr and exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message, "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

# Put the lib directory (two levels above this file) at the front of the load
# path before requiring the library.
$LOAD_PATH.unshift("#{File.dirname(__FILE__)}/../../lib")
require 'bio-table'

require 'rspec/expectations'
@@ -0,0 +1,23 @@
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio-logger'
12
+ require 'bio-table/columns.rb'
13
+ require 'bio-table/validator.rb'
14
+ require 'bio-table/filter.rb'
15
+ require 'bio-table/parser.rb'
16
+ require 'bio-table/formatter.rb'
17
+ require 'bio-table/tablerow.rb'
18
+ require 'bio-table/table.rb'
19
+ require 'bio-table/tablereader.rb'
20
+ require 'bio-table/tablewriter.rb'
21
+ require 'bio-table/diff.rb'
22
+ require 'bio-table/overlap.rb'
23
+
@@ -0,0 +1,11 @@
1
module BioTable

  # Helpers for interpreting a column selection.
  module Columns

    # Converts a column selection into a list of integers.
    #
    # list:: nil, an empty string, or a collection whose items respond to to_i
    #
    # Returns [0] when list is nil or "", otherwise every item converted with
    # to_i.
    def self.to_list list
      if list.nil? || list == ""
        [0]
      else
        list.map(&:to_i)
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # Diff module
2
+ #
3
+
4
module BioTable

  # Table comparison helpers.
  module Diff

    # Builds a table holding the rows of t2 whose key does not occur in t1.
    #
    # A row's key is the list of its all_fields values at the positions given
    # by Columns.to_list(options[:diff]). The result uses t1's header. A
    # warning is logged when either table contains repeated keys.
    def self.diff_tables t1, t2, options
      logger = Bio::Log::LoggerPlus['bio-table']
      columns = Columns::to_list(options[:diff])
      result = Table.new(t1.header)

      key_of = proc { |row| columns.map { |i| row.all_fields[i] } }
      keys1 = t1.map(&key_of)
      keys2 = t2.map(&key_of)

      duplicated = keys1.uniq.size != keys1.size || keys2.uniq.size != keys2.size
      logger.warn "Not all selected keys are unique!" if duplicated

      missing_in_t1 = keys2 - keys1

      # Index table 2 by key; with repeated keys the last row wins.
      rows_by_key = {}
      t2.each { |row| rows_by_key[key_of.call(row)] = row }

      missing_in_t1.each do |values|
        result.push(t2.row_by_columns(columns.zip(values), rows_by_key))
      end
      result
    end

  end

end
@@ -0,0 +1,57 @@
1
module BioTable

  # Column selection and row filtering helpers.
  module Filter

    # Translates a column selection into positions within header, so header
    # A,B,C,D with columns C,A returns [2,0].
    #
    # When every entry of columns is already an integer (either an Integer or
    # a string such as "2") the entries are returned converted with to_i.
    # Otherwise every entry is looked up by name in header.
    #
    # Returns nil when columns is nil or false.
    # Raises RuntimeError when a name is not present in header.
    def self.create_column_index(columns, header)
      return nil unless columns
      return columns.map { |v| v.to_i } if columns.all? { |v| valid_int?(v) }

      columns.map do |name|
        pos = header.index(name)
        raise "Column name #{name} not found!" if pos.nil?
        pos
      end
    end

    # Picks the fields at the positions listed in index, in that order.
    # Returns fields unchanged when index is nil or false.
    def self.apply_column_filter(fields, index)
      return fields unless index
      index.map { |idx| fields[idx] }
    end

    # True when s reads as a plain integer. The comparison is made on the
    # string form so that Integer values are accepted as well as strings
    # (comparing against s itself rejected every Integer).
    def self.valid_int?(s)
      s.to_i.to_s == s.to_s
    end

    # True when the string form of s is an optionally signed decimal number
    # such as "12", "-3" or "+4.5".
    def self.valid_number?(s)
      !s.to_s.match(/\A[+-]?\d+?(\.\d+)?\Z/).nil?
    end

    # Evaluates the Ruby expression in code against one row.
    #
    # The expression can refer to the locals +values+ (the fields converted to
    # Float, nil for anything that is not a valid number) and +fields+ (the
    # raw fields), so those names must not be changed.
    #
    # Returns true when code is nil, false when fields is nil or false, and
    # otherwise the result of the expression. When evaluation raises a
    # StandardError a message is printed on stderr and the error is re-raised.
    #
    # SECURITY NOTE: code is passed to eval and can execute arbitrary Ruby.
    # Only pass expressions from a trusted source.
    def self.numeric(code, fields)
      return true if code.nil?
      return false unless fields

      values = fields.map { |field| valid_number?(field) ? field.to_f : nil } # FIXME: not so lazy
      begin
        eval(code)
      rescue
        $stderr.print "Failed to evaluate ",fields," with ",code,"\n"
        raise
      end
    end
  end

end
@@ -0,0 +1,28 @@
1
# CsvFormatter uses CSV; require it here so this file does not depend on
# another file having loaded it first.
require 'csv'

module BioTable

  # Writes rows as tab-separated text on standard output.
  class TabFormatter
    # Prints the items of list joined by tabs and followed by a newline.
    # nil items are written as "NA".
    def write list
      cells = list.map { |field| field.nil? ? "NA" : field }
      print cells.join("\t"), "\n"
    end

  end

  # Writes rows as CSV on standard output.
  class CsvFormatter

    # Prints list as a single CSV record produced by CSV.generate.
    def write list
      print CSV.generate { |csv| csv << list }
    end
  end

  # Chooses the formatter for an output format.
  module FormatFactory
    # Returns a CsvFormatter when format is :csv and a TabFormatter for any
    # other value.
    def self.create format
      # @logger.info("Formatting to #{format}")
      format == :csv ? CsvFormatter.new : TabFormatter.new
    end
  end
end
28
+
@@ -0,0 +1 @@
1
+ # Overlap module
@@ -0,0 +1,22 @@
1
+ require 'csv'
2
+
3
module BioTable

  # Splits one line of table text into fields.
  module LineParser

    # Converts a string into an array of string fields.
    #
    # in_format:: :csv parses the line with CSV.parse and returns its first
    #             record; any other value treats the line as tab-separated.
    #
    # For tab-separated input only the line terminator is removed before
    # splitting, and empty cells are kept, so leading and trailing empty
    # cells stay in position (stripping the whole line and using the default
    # split dropped them). Each field is stripped of surrounding whitespace
    # and the text "NA" becomes nil.
    def LineParser::parse(line, in_format)
      if in_format == :csv
        CSV.parse(line)[0]
      else
        # A negative limit keeps trailing empty fields.
        line.chomp.split("\t", -1).map do |field|
          fld = field.strip
          fld == "NA" ? nil : fld
        end
      end
    end

  end

end
@@ -0,0 +1,121 @@
1
module BioTable

  # In-memory table: a header, a list of row names and a parallel list of
  # row field arrays. Rows are handed out as TableRow objects.
  class Table

    include Enumerable

    attr_reader :header, :rows, :rownames

    # header:: optional list of column names. When omitted, the header is
    #          taken from the first line given to #read_lines.
    def initialize header=nil
      @header = header if header
      @logger = Bio::Log::LoggerPlus['bio-table']
      @rows = []
      @rownames = []
    end

    # Read lines (list of string) and add them to the table, setting row
    # names and row fields. The first line is parsed as the header and is
    # checked with Validator.valid_header? against a header set earlier.
    #
    # Options read here:
    # :in_format::      passed to LineParser.parse for every line
    # :columns::        column selection, resolved with
    #                   Filter.create_column_index
    # :num_filter::     expression passed to Filter.numeric for every row
    # :with_rownamess:: (spelled with a double "s") when set, data fields
    #                   start at the first column and no row name is recorded
    #
    # Rows rejected by Validator.valid_row? or Filter.numeric are skipped.
    # Nothing happens when lines is nil or empty.
    def read_lines lines, options = {}
      # An empty input has no header line to parse.
      return if lines.nil? || lines.empty?

      num_filter = options[:num_filter]
      @logger.debug "Filtering on #{num_filter}" if num_filter
      use_columns = options[:columns]
      @logger.debug "Filtering on columns #{use_columns}" if use_columns
      include_rownamess = options[:with_rownamess]
      @logger.debug "Include row names" if include_rownamess
      first_column = (include_rownamess ? 0 : 1)

      # parse the header
      header = LineParser::parse(lines[0], options[:in_format])
      Validator::valid_header?(header, @header)
      @header = header if not @header

      column_index = Filter::create_column_index(use_columns,header)
      @header = Filter::apply_column_filter(header,column_index)

      (lines[1..-1]).each do | line |
        fields = LineParser::parse(line, options[:in_format])
        # The CSV branch of the parser yields nil for an empty string.
        next if fields.nil?
        fields = Filter::apply_column_filter(fields,column_index)
        rowname = fields[0]
        data_fields = fields[first_column..-1]
        next if not Validator::valid_row?(data_fields,@header,@rows)
        next if not Filter::numeric(num_filter,data_fields)
        @rownames << rowname if not include_rownamess # otherwise doubles rownamess
        @rows << data_fields
      end
    end

    # Read a file and add its lines with #read_lines. When no :in_format is
    # given and the file name ends in .csv the format is set to :csv.
    #
    # The caller's options hash is left untouched: the detected format is
    # stored in a copy, so it cannot leak into the reading of another file.
    def read_file filename, options = {}
      options = options.dup
      if not options[:in_format] and filename =~ /\.csv$/
        @logger.debug "Autodetected CSV file"
        options[:in_format] = :csv
      end
      @logger.debug(options)
      # Read the file lines into an Array, not lazy FIXME
      # File.readlines closes the file once the lines are read.
      read_lines(File.readlines(filename), options)
    end

    # Write the table through the formatter chosen by options[:format]
    # (:tab when not set). The header is written only when
    # options[:write_header] is set; rows that are not valid? are skipped.
    def write options = {}
      format = options[:format] || :tab
      formatter = FormatFactory::create(format)
      formatter.write(@header) if options[:write_header]
      each do | tablerow |
        formatter.write(tablerow.all_fields) if tablerow.valid?
      end
    end

    # Append a row. Accepts either a TableRow as the only argument, or a row
    # name followed by the list of fields.
    def push rownames,fields = nil
      if fields == nil and rownames.kind_of?(TableRow)
        @rownames << rownames.rowname
        @rows << rownames.fields
      else
        @rownames << rownames
        @rows << fields
      end
    end

    # Return the row at position row as a TableRow, or nil when row is nil.
    def [] row
      return nil unless row
      TableRow.new(@rownames[row],@rows[row])
    end

    # Return the first row with the given name, or nil when there is none.
    def row_by_name name
      self[rownames.index(name)]
    end

    # Find a row by column values.
    #
    # zip:: list of [position, value] pairs, positions referring to
    #       TableRow#all_fields
    # idx:: optional Hash mapping a list of values (in the order of zip) to a
    #       row; when given it is consulted instead of scanning the table
    #
    # Returns the matching row, or nil when nothing matches. A key missing
    # from idx yields nil (it used to raise NoMethodError).
    def row_by_columns zip,idx=nil
      if idx
        row = idx[zip.map { |_, value| value }]
        return row if row && row.match_all_fields?(zip)
        nil
      else
        find { |row| row.match_all_fields?(zip) }
      end
    end

    # Yield every row as a TableRow. Returns an Enumerator when called
    # without a block.
    def each
      return enum_for(:each) unless block_given?
      @rows.each_with_index do | row,i |
        yield TableRow.new(@rownames[i], row)
      end
    end

  end

end
@@ -0,0 +1,13 @@
1
module BioTable

  # Entry point for loading a table from a file.
  module TableReader
    # Logs the file name, reads the file into a new Table with
    # Table#read_file (passing options through) and returns that table.
    def TableReader::read_file filename, options = {}
      Bio::Log::LoggerPlus['bio-table'].info("Parsing #{filename}")
      Table.new.tap { |table| table.read_file(filename, options) }
    end
  end

end
@@ -0,0 +1,29 @@
1
module BioTable

  # One table row: a row name together with its data fields.
  class TableRow
    attr_reader :rowname, :fields

    def initialize rowname, fields
      @rowname, @fields = rowname, fields
    end

    # The row name followed by the data fields, as one flattened list.
    def all_fields
      [@rowname, @fields].flatten
    end

    # True when the row has at least one data field.
    def valid?
      !fields.nil? && fields.size > 0
    end

    # zip is a list of [position, value] pairs; true when every position of
    # all_fields holds the paired value (also true for an empty list).
    def match_all_fields? zip
      row_fields = all_fields
      zip.all? { |pair| row_fields[pair[0]] == pair[1] }
    end
  end

end
@@ -0,0 +1,6 @@
1
module BioTable
  # Namespace for table writing; it does not define anything yet.
  module TableWriter
  end
end
@@ -0,0 +1,26 @@
1
module BioTable

  # Consistency checks used while reading a table.
  module Validator
    # Compares a freshly parsed header with one that was set earlier.
    #
    # Returns true when old_header is nil/false, or when header holds no
    # entry that is missing from old_header. Otherwise both headers are
    # printed on stderr (stdout carries table output) and a RuntimeError is
    # raised.
    def Validator::valid_header? header, old_header
      if old_header && !(header - old_header).empty?
        $stderr.puts old_header.inspect, header.inspect
        raise "Headers do not compare!"
      end
      true
    end

    # Checks the data fields of one row against the rows accepted so far.
    #
    # fields:: data fields of the candidate row
    # header:: not used by this check
    # rows::   rows already accepted
    #
    # Returns false when fields is nil or empty, true when the row can be
    # added. Raises RuntimeError when the number of fields differs from that
    # of the last accepted row (the original used throw, which only produced
    # an "uncaught throw" error).
    def Validator::valid_row? fields, header, rows
      return false if fields.nil? || fields.size == 0
      if rows.size > 0 && fields.size != rows.last.size
        $stderr.puts rows.last.inspect, fields.inspect
        raise "Number of fields diverge in line #{rows.size + 1} (size #{fields.size}, expected #{rows.last.size})"
      end
      true
    end
  end

end