bio-table 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,28 @@
1
+ # require 'bio-table'
2
+
3
# Step definitions for the "Read CSV table" feature.

# Stores the doc string of the step as a list of lines in @lines.
Given /^a comma separated table$/ do |string|
  @lines = string.split(/\n/)
end

# Intentionally empty: the actual parsing happens in the Then step below.
When /^I read the multi\-line string$/ do
end

# Parses @lines as CSV into @t and checks the first three header names.
Then /^I should correctly parse the comma\-separated headers into "(.*?)","(.*?)","(.*?)"$/ do |arg1, arg2, arg3|
  @t = BioTable::Table.new
  @t.read_lines(@lines, :in_format => :csv)
  @t.header[0].should == arg1
  @t.header[1].should == arg2
  @t.header[2].should == arg3
end

# Compares the data fields of the first row with the Ruby array literal given
# in the doc string.
And /^I should correctly parse the first line into$/ do |string|
  line1 = @t[0]
  # NOTE(review): eval runs whatever the feature file contains; acceptable for
  # a checked-in fixture, never feed it external input.
  s = eval(string);
  line1.fields.should == s
end

# Checks the name of the first row against the value captured from the step
# text (it used to be compared against a hard-coded "105853").
And /^it should have the rowname "(.*?)"$/ do |arg1|
  @t.rownames[0].should == arg1
end
27
+
28
+
@@ -0,0 +1,22 @@
1
+ Feature: Read CSV table
2
+
3
+ bio-table should read multiple table formats.
4
+
5
+ Scenario: Read a comma separated table
6
+ Given a comma separated table
7
+ """
8
+ #Gene,AJ,B6,Axb1,Axb2,Axb4,Axb12,AXB13,Axb15,Axb19,Axb23,Axb24,Bxa1,Bxa2,Bxa4,Bxa7,Bxa8,Bxa12,Bxa11,Bxa13,Bxa15,Bxa16,Axb5,Axb6,Axb8,Axb1,Bxa24,Bxa25,Bxa26,gene_symbol,gene_desc
9
+ 105853,0.06,0,0,0,0,0,0.11,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,Mal2,"MAL2 proteolipid protein"
10
+ 105855,236.88,213.95,213.15,253.49,198,231.56,200.96,255.2,214.04,231.46,,,233.23,241.26,237.53,171.87,237.13,162.3,252.13,284.85,188.76,253.43,220.15,305.52,,217.42,,,Nckap1l,"NCK associated protein 1 like,NCK associated protein 1 like,"
11
+ 105859,0,0.14,0,0,0.07,0.04,0,0,0,0,,,0.02,0,0,0,0,0,0.06,0,0,0,0.02,0,,0,,,Csdc2,"RNA-binding protein pippin"
12
+ 105866,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,0,,,Krt72,"keratin 72"
13
+ """
14
+ When I read the multi-line string
15
+ Then I should correctly parse the comma-separated headers into "#Gene","AJ","B6"
16
+ And I should correctly parse the first line into
17
+ """
18
+ ["0.06", "0", "0", "0", "0", "0", "0.11", "0", "0", "0", nil, nil, "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", nil, "0", nil, nil, "Mal2", "MAL2 proteolipid protein"]
19
+ """
20
+ And it should have the rowname "105853"
21
+
22
+
@@ -0,0 +1,13 @@
1
# Cucumber support file: activates the bundle, puts the local lib directory
# on the load path and loads the library under test.
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # Report the Bundler failure and exit with the status code Bundler supplies
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

# Prefer the lib directory two levels above this file over any installed gem
lib_dir = File.dirname(__FILE__) + '/../../lib'
$LOAD_PATH.unshift(lib_dir)
require 'bio-table'

require 'rspec/expectations'
@@ -0,0 +1,23 @@
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio-logger'
12
+ require 'bio-table/columns.rb'
13
+ require 'bio-table/validator.rb'
14
+ require 'bio-table/filter.rb'
15
+ require 'bio-table/parser.rb'
16
+ require 'bio-table/formatter.rb'
17
+ require 'bio-table/tablerow.rb'
18
+ require 'bio-table/table.rb'
19
+ require 'bio-table/tablereader.rb'
20
+ require 'bio-table/tablewriter.rb'
21
+ require 'bio-table/diff.rb'
22
+ require 'bio-table/overlap.rb'
23
+
@@ -0,0 +1,11 @@
1
module BioTable

  # Helpers for turning a column selection into a list of column indices.
  module Columns

    # Returns the selected columns as a list of Integer indices.
    #
    # list - nil, a single index, an Array of indices (Integers or numeric
    #        Strings) or a comma-separated String such as "1,3".
    #
    # Returns [0] when nothing is selected (nil, "" or an empty Array).
    # Entries are converted with to_i, so a non-numeric entry becomes 0.
    def self.to_list list
      # A non-empty String used to crash on #map; split it into entries instead
      items = list.is_a?(String) ? list.split(',') : Array(list)
      return [0] if items.empty?
      items.map { |item| item.to_i }
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # Diff module
2
+ #
3
+
4
module BioTable

  # Row-level comparison of two tables.
  module Diff

    # Returns a new table, with the header of t1, holding the rows of t2 whose
    # key is not present in t1. The key of a row is the list of its values
    # (row name included, see all_fields) in the columns selected by
    # options[:diff]; Columns::to_list turns that option into indices.
    #
    # A warning is logged when the keys of either table are not unique.
    def self.diff_tables t1, t2, options
      logger = Bio::Log::LoggerPlus['bio-table']
      columns = Columns::to_list(options[:diff])
      key_of = lambda do |row|
        all = row.all_fields
        columns.map { |i| all[i] }
      end

      keys1 = t1.map { |row| key_of.call(row) }
      # Collect the keys of table 2 and index its rows by key in one pass;
      # for duplicate keys the last row wins
      rows2_by_key = {}
      keys2 = t2.map do |row|
        key = key_of.call(row)
        rows2_by_key[key] = row
        key
      end

      has_duplicates = keys1.uniq.size != keys1.size || keys2.uniq.size != keys2.size
      logger.warn "Not all selected keys are unique!" if has_duplicates

      result = Table.new(t1.header)
      (keys2 - keys1).each do |values|
        result.push(t2.row_by_columns(columns.zip(values), rows2_by_key))
      end
      result
    end

  end

end
@@ -0,0 +1,57 @@
1
module BioTable

  # Column selection and numeric row filtering helpers.
  module Filter

    # Create an index to the column headers, so header A,B,C,D with columns
    # C,A returns [2,0]. When every entry of columns already is an integer
    # (an Integer or a String such as "2") the entries are returned as
    # Integers without consulting the header.
    #
    # Returns nil when columns is nil/false.
    # Raises RuntimeError when a column name is not in the header.
    def self.create_column_index columns, header
      return nil if not columns
      return columns.map { |v| v.to_i } if columns.all? { |v| valid_int?(v) }

      columns.map do |name|
        pos = header.index(name)
        raise "Column name #{name} not found!" if pos.nil?
        pos
      end
    end

    # Picks the fields at the positions listed in index, in that order.
    # Returns fields unchanged when index is nil/false.
    def self.apply_column_filter fields, index
      if index
        index.map { |idx| fields[idx] }
      else
        fields
      end
    end

    # True when s is an integer in canonical form: an Integer, or a String
    # such as "12" or "-3" (but not "1.5", " 1" or "01").
    def self.valid_int?(s)
      # Compare string forms so that real Integers are accepted too; the raw
      # comparison ("5" == 5) rejected already-indexed columns
      s.to_i.to_s == s.to_s
    end

    # True when the string form of s is an optionally signed decimal number
    # such as "3", "-1.5" or "+0.25".
    def self.valid_number?(s)
      !s.to_s.match(/\A[+-]?\d+?(\.\d+)?\Z/).nil?
    end

    # Evaluates the Ruby expression in code against one row.
    #
    # Inside the expression `fields` is the list of raw fields and `values`
    # the same list with numbers converted to Float and everything else set
    # to nil. Both local names are part of the contract - do not rename them.
    #
    # Returns true when code is nil, false when fields is nil/false, and the
    # result of the expression otherwise. An error raised by the expression
    # is reported on stderr and re-raised.
    def self.numeric code, fields
      return true if code.nil?
      return false unless fields

      values = fields.map { |field| (valid_number?(field) ? field.to_f : nil ) } # FIXME: not so lazy
      begin
        # NOTE(review): eval executes arbitrary Ruby; code must only ever
        # come from a trusted source
        eval(code)
      rescue
        $stderr.print "Failed to evaluate ",fields," with ",code,"\n"
        raise
      end
    end
  end

end
@@ -0,0 +1,28 @@
1
module BioTable

  # Prints a row to standard output as one tab-delimited line.
  class TabFormatter
    # list - the fields of the row; nil fields are printed as "NA".
    def write list
      cells = list.map { |field| field.nil? ? "NA" : field }
      print cells.join("\t"), "\n"
    end
  end

  # Prints a row to standard output as one CSV line.
  class CsvFormatter
    # list - the fields of the row, encoded by the CSV library.
    def write list
      print CSV.generate { |csv| csv << list }
    end
  end

  # Chooses the formatter for an output format.
  module FormatFactory
    # Returns a CsvFormatter for :csv and a TabFormatter for anything else.
    def self.create format
      # @logger.info("Formatting to #{format}")
      format == :csv ? CsvFormatter.new : TabFormatter.new
    end
  end
end
28
+
@@ -0,0 +1 @@
1
+ # Overlap module
@@ -0,0 +1,22 @@
1
+ require 'csv'
2
+
3
module BioTable

  # Splits a single line of table text into fields.
  module LineParser

    # Converts a string into an array of string fields.
    #
    # For in_format :csv the line is handed to the CSV library and its first
    # parsed row is returned (nil when there is none). Any other format is
    # treated as tab-delimited: the line and every field are stripped of
    # surrounding whitespace and a field reading "NA" becomes nil.
    def self.parse(line, in_format)
      return CSV.parse(line).first if in_format == :csv

      line.strip.split("\t").map do |field|
        cell = field.strip
        cell == "NA" ? nil : cell
      end
    end

  end

end
@@ -0,0 +1,121 @@
1
module BioTable

  # In-memory table: a header plus two parallel lists, @rownames and @rows,
  # where @rows[i] holds the data fields that belong to @rownames[i].
  # Iterating yields one TableRow per row.
  class Table

    include Enumerable

    attr_reader :header, :rows, :rownames

    # header - optional list of column names; when given, the header of
    #          parsed input is validated against it.
    def initialize header=nil
      @header = header if header
      @logger = Bio::Log::LoggerPlus['bio-table']
      @rows = []
      @rownames = []
    end

    # Read lines (list of string) and add them to the table, setting row names
    # and row fields. The first line is parsed as the header and validated
    # against a header that was set earlier.
    #
    # Options:
    #   :in_format     - passed to LineParser::parse (:csv or tab-delimited)
    #   :columns       - column names or indices to keep
    #   :num_filter    - Ruby expression passed to Filter::numeric; rows for
    #                    which it is false are skipped
    #   :with_rownames - keep the first column inside the data fields and do
    #                    not record separate row names (the historical
    #                    misspelling :with_rownamess is still accepted)
    #
    # Empty input is ignored. Lines that parse to nothing are skipped.
    def read_lines lines, options = {}
      return if lines.nil? || lines.empty?

      num_filter = options[:num_filter]
      @logger.debug "Filtering on #{num_filter}" if num_filter
      use_columns = options[:columns]
      @logger.debug "Filtering on columns #{use_columns}" if use_columns
      include_rownames = options[:with_rownames] || options[:with_rownamess]
      @logger.debug "Include row names" if include_rownames
      first_column = (include_rownames ? 0 : 1)

      # parse the header
      header = LineParser::parse(lines[0], options[:in_format])
      Validator::valid_header?(header, @header)

      column_index = Filter::create_column_index(use_columns,header)
      @header = Filter::apply_column_filter(header,column_index)

      lines[1..-1].each do | line |
        fields = LineParser::parse(line, options[:in_format])
        # An empty CSV line parses to nil; skip it instead of crashing below
        next if fields.nil?
        fields = Filter::apply_column_filter(fields,column_index)
        rowname = fields[0]
        data_fields = fields[first_column..-1]
        next if not Validator::valid_row?(data_fields,@header,@rows)
        next if not Filter::numeric(num_filter,data_fields)
        @rownames << rowname if not include_rownames # otherwise doubles rownames
        @rows << data_fields
      end
    end

    # Reads filename and adds its lines through read_lines. A name ending in
    # .csv selects :in_format => :csv unless the caller set a format. The
    # caller's options hash is never modified, so an autodetected format does
    # not leak into the next file read with the same options.
    def read_file filename, options = {}
      if not options[:in_format] and filename =~ /\.csv$/
        @logger.debug "Autodetected CSV file"
        options = options.merge(:in_format => :csv)
      end
      @logger.debug(options)
      # Read the file lines into an Array, not lazy FIXME
      # File.readlines closes the file when done
      lines = File.readlines(filename)
      read_lines(lines, options)
    end

    # Writes the table through the formatter chosen by FormatFactory.
    #
    # Options:
    #   :format       - output format, :tab when not given
    #   :write_header - write the header first when set
    #
    # Rows for which TableRow#valid? is false are not written.
    def write options = {}
      format = options[:format] || :tab
      formatter = FormatFactory::create(format)
      formatter.write(@header) if options[:write_header]
      each do | tablerow |
        formatter.write(tablerow.all_fields) if tablerow.valid?
      end
    end

    # Appends a row. Call with a single TableRow, or with a row name and its
    # list of fields.
    def push rownames,fields = nil
      if fields == nil and rownames.kind_of?(TableRow)
        @rownames << rownames.rowname
        @rows << rownames.fields
      else
        @rownames << rownames
        @rows << fields
      end
    end

    # Returns the row at position row as a TableRow, or nil when row is nil.
    def [] row
      return nil unless row
      TableRow.new(@rownames[row],@rows[row])
    end

    # Returns the first row with the given row name, or nil when there is none.
    def row_by_name name
      self[rownames.index(name)]
    end

    # Finds a row by column values.
    #
    # zip - list of [index, value] pairs; indices refer to TableRow#all_fields
    # idx - optional Hash from the list of values to a row; when given it is
    #       consulted instead of scanning the table
    #
    # Returns the row when it matches every pair, nil otherwise (including
    # when idx has no entry for the values).
    def row_by_columns zip,idx=nil
      index, value = zip.first
      if idx
        row = idx[zip.transpose[1]]
        return row if row && row.match_all_fields?(zip)
      else
        each do | row |
          # Cheap test on the first pair before checking all of them
          if row.all_fields[index] == value
            return row if row.match_all_fields?(zip)
          end
        end
      end
      nil
    end

    # Yields a TableRow for every row, in insertion order. Returns an
    # Enumerator when called without a block.
    def each
      return to_enum(:each) unless block_given?
      @rows.each_with_index do | row,i |
        yield TableRow.new(@rownames[i], row)
      end
    end

  end

end
@@ -0,0 +1,13 @@
1
module BioTable

  # Convenience entry point for loading a table from a file.
  module TableReader
    # Logs the file name, reads filename into a fresh Table (options are
    # passed on to Table#read_file) and returns that table.
    def self.read_file filename, options = {}
      Bio::Log::LoggerPlus['bio-table'].info("Parsing #{filename}")
      table = Table.new
      table.read_file(filename, options)
      table
    end
  end

end
@@ -0,0 +1,29 @@
1
module BioTable

  # One table row: its name and its list of data fields.
  class TableRow
    attr_reader :rowname, :fields

    def initialize rowname, fields
      @rowname = rowname
      @fields = fields
    end

    # Row name followed by the data fields, as one flat list.
    def all_fields
      [@rowname, @fields].flatten
    end

    # True when the row has at least one data field.
    def valid?
      !fields.nil? && fields.size > 0
    end

    # zip - list of [index, value] pairs, indices referring to all_fields.
    # True when every pair matches (also when zip is empty).
    def match_all_fields? zip
      row_fields = all_fields
      zip.all? { |pair| row_fields[pair[0]] == pair[1] }
    end
  end

end
@@ -0,0 +1,6 @@
1
module BioTable

  # Placeholder namespace: no writer methods are defined here yet.
  module TableWriter
  end

end
@@ -0,0 +1,26 @@
1
module BioTable

  # Consistency checks applied while a table is being read.
  module Validator

    # Checks a freshly parsed header against the header already in use.
    #
    # Returns true when old_header is nil/false or when header contains no
    # name that is missing from old_header.
    # Raises RuntimeError otherwise, after printing both headers to stderr
    # (stdout is where the table itself is written).
    def self.valid_header? header, old_header
      if old_header && header - old_header != []
        $stderr.puts old_header.inspect
        $stderr.puts header.inspect
        raise "Headers do not compare!"
      end
      true
    end

    # Checks the data fields of one row.
    #
    # fields - the data fields of the new row
    # header - not used by the check
    # rows   - the rows accepted so far
    #
    # Returns false for a nil or empty row and true when the row has as many
    # fields as the last accepted row (or is the first row).
    # Raises ArgumentError when the number of fields differs; this used to be
    # a `throw` without a matching `catch`, which only surfaced as an
    # "uncaught throw" error. Both rows are printed to stderr first.
    def self.valid_row? fields, header, rows
      return false if fields.nil? || fields.size == 0
      if rows.size > 0 && fields.size != rows.last.size
        $stderr.puts rows.last.inspect
        $stderr.puts fields.inspect
        raise ArgumentError, "Number of fields diverge in line #{rows.size + 1} (size #{fields.size}, expected #{rows.last.size})"
      end
      true
    end
  end

end