RubyGems - bio-table - Versions diffs - 0.9.0 → 1.0.0 - Mend

bio-table 0.9.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/Gemfile +0 -1
data/README.md +46 -2
data/VERSION +1 -1
data/bin/bio-table +246 -215
data/lib/bio-table.rb +1 -0
data/lib/bio-table/count.rb +39 -0
data/lib/bio-table/filter.rb +1 -0
data/lib/bio-table/merge.rb +3 -1
data/lib/bio-table/parser.rb +6 -4
data/lib/bio-table/rewrite.rb +9 -2
data/lib/bio-table/table_apply.rb +4 -0
data/lib/bio-table/tableload.rb +2 -1
data/lib/bio-table/validator.rb +4 -4
data/test/data/regression/table1-STDIN.ref +1 -0
metadata +3 -16

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 208de532730ef88e9a1ecc2961f08ceaa464d846
-  data.tar.gz: 4b88172ae40141a1db2dcb5c3a7b55daeea1beed
+  metadata.gz: b33f9729f357116b8a58a31cdefdba144c5bfb5d
+  data.tar.gz: 04ed672ff432dcbe3b611a1441e83a84db5ebecc
 SHA512:
-  metadata.gz: 5d0a5f044e44a89fd047db4443fca2d8729af55f0cc7106ab45e939e3e92c7356dffd7f9308a63cb4a0ff39b313c9321edf3cb637af587f8b7ff8796c1083415
-  data.tar.gz: cb9abfe846b2c1949d027e38415e486719cf59db5fb99d38ec495b1b828d8a838dca19ee63c302844f82fb23dc4ec2a61c1749cbfb286f77ac0eecffa14e470e
+  metadata.gz: ca15864e31d7c9dcfae49edbf281e27e4afae738a507787bc5b1860ee42ead5e2e4a418387fdbcb9c68ef674d86417a0760151db51b89af33ee1de5617275cee
+  data.tar.gz: 3cf3e0540bf9f3beea349d3e4db8d3792b6b394070171b451ea8105a9c9b89ba09a72e802060dc61aaae92f110707f0fbce5ed7662d996ef95257fb88c0e6386

data/Gemfile CHANGED

@@ -12,7 +12,6 @@ group :development do
   gem "cucumber"
   gem "bundler"
   gem "jeweler", "~> 2.0.0"
-  gem "bio", ">= 1.4.2"
   gem "rdoc"
   gem "regressiontest", ">= 0.0.2"
 end

data/README.md CHANGED

@@ -38,6 +38,7 @@ Features:
 * Calculate new values
 * Calculate column statistics (mean, standard deviation)
 * Diff between tables, selecting on specific column values
+* Count elements in columns
 * Merge tables side by side on column value/rowname
 * Split/reduce tables by column
 * Write formatted tables, e.g. HTML, LaTeX
@@ -174,6 +175,13 @@ in the table, use unshift headers, 0 becomes an 'ID' column
     bio-table table1.csv --unshift-headers --columns 0,1,8,2,4,6
 ```
+Another option pads short rows with empty fields, so that every row
+has the same number of fields as the header
+```sh
+    bio-table table1.csv --pad-fields
+```
 Duplicate columns with
 ```sh
@@ -186,6 +194,13 @@ Combine column values (more on rewrite below)
     bio-table table1.csv --rewrite "rowname = rowname + '-' + field[0]"
 ```
+To insert a table column simply add a tab, e.g., to inject a
+column containing 'PATHWAY'
+```sh
+    bio-table table1.csv --rewrite 'field[0] = "PATHWAY\t"+field[0]'
+```
 To filter for columns using a regular expression
 ```sh
@@ -220,6 +235,29 @@ Another option is to use (lazy) values:
 which saves the typing to to_f.
+Another feature is counting column elements. With
+```sh
+    bio-table table1.csv --count 0,1,4
+```
+All records are combined that have the same rowname and values in
+columns 0 and 3. In addition a column is added counting the number of
+merged rows. So,
+```
+    hs8     48713371        53713371        G       SAMPLE005
+    hs8     48713371        53713371        G       SAMPLE005
+    hs9     136643994       141643994       C       SAMPLE005
+```
+becomes
+```
+    hs8     48713371        53713371        G       SAMPLE005    2
+    hs9     136643994       141643994       C       SAMPLE005    1
+```
 ### Statistics
 bio-table can handle some column statistics using the Ruby statsample
@@ -287,9 +325,11 @@ with NA's, unless you add a filter, e.g.
 ```sh
     bio-table --merge table1.csv table2.csv --num-filter "values.compact.size == values.size"
 ```
+If you don't want the headers to be 'restyled' on merge, use the --keep-headers
+override.
 ### Splitting a table
 Splitting a table by column is possible by named or indexed columns,
@@ -554,7 +594,11 @@ Coming soon
 The API doc is online. For more code examples see the test files in
 the source tree.
+## Troubleshooting
+Run bio-table with the --debug switch to get stack traces. Use --debug
+and/or --trace for more output.
 ## Project home page
 Information on the source tree, documentation, examples, issues and

data/VERSION CHANGED

@@ -1 +1 @@
-0.9.0
+1.0.0

data/bin/bio-table CHANGED

@@ -2,280 +2,311 @@
 #
 # BioRuby bio-table Plugin BioTable
 # Author:: Pjotr Prins
-# Copyright:: 2012
-rootpath = File.dirname(File.dirname(__FILE__))
-$: << File.join(rootpath,'lib')
+begin
-_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp
+  rootpath = File.dirname(File.dirname(__FILE__))
+  $: << File.join(rootpath,'lib')
-INPUT_ON_STDIN = !$stdin.tty?
+  _VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp
-$stderr.print "bio-table "+_VERSION+" Copyright (C) 2012 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"
+  INPUT_ON_STDIN = !$stdin.tty?
-USAGE =<<EOU
+  $stderr.print "bio-table "+_VERSION+" Copyright (C) 2012-2014 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"
-bio-table transforms, filters and reorders table files (CSV, tab-delimited).
+  USAGE =<<EOU
-EOU
+  bio-table transforms, filters and reorders table files (CSV, tab-delimited).
-if ARGV.size == 0 and not INPUT_ON_STDIN
-  print USAGE
-end
+EOU
-require 'bio-table'
-require 'optparse'
-require 'bio-logger'
+  if ARGV.size == 0 and not INPUT_ON_STDIN
+    print USAGE
+  end
-log = Bio::Log::LoggerPlus.new 'bio-table'
-# log.outputters = Bio::Log::Outputter.stderr
+  require 'bio-table'
+  require 'optparse'
+  require 'bio-logger'
-Bio::Log::CLI.logger('stderr')
-Bio::Log::CLI.trace('info')
+  log = Bio::Log::LoggerPlus.new 'bio-table'
+  # log.outputters = Bio::Log::Outputter.stderr
-options = {show_help: false, write_header: true, skip: 0}
-options[:show_help] = true if ARGV.size == 0 and not INPUT_ON_STDIN
-opts = OptionParser.new do |o|
-  o.banner = "Usage: #{File.basename($0)} [options] filename\n\n"
+  Bio::Log::CLI.logger('stderr')
+  Bio::Log::CLI.trace('info')
-  o.on('--num-filter expression', 'Numeric filtering function') do |par|
-    options[:num_filter] = par
-  end
+  options = {show_help: false, write_header: true, skip: 0}
+  options[:show_help] = true if ARGV.size == 0 and not INPUT_ON_STDIN
+  opts = OptionParser.new do |o|
+    o.banner = "Usage: #{File.basename($0)} [options] filename\n\n"
-  o.on('--filter expression', 'Generic filtering function') do |par|
-    options[:filter] = par
-  end
+    o.on('--num-filter expression', 'Numeric filtering function') do |par|
+      options[:num_filter] = par
+    end
-  o.on('--rewrite expression', 'Rewrite function') do |par|
-    options[:rewrite] = par
-  end
+    o.on('--filter expression', 'Generic filtering function') do |par|
+      options[:filter] = par
+    end
-  o.on('--columns list', Array, 'List of column names or indices') do |l|
-    options[:columns] = l
-  end
+    o.on('--rewrite expression', 'Rewrite function') do |par|
+      options[:rewrite] = par
+    end
-  o.on('--column-filter expression', 'Column name filtering function') do |par|
-    options[:column_filter] = par
-  end
+    o.on('--count list', Array, 'Merge and count similar rows') do |list|
+      options[:count] = list
+    end
-  o.on('--merge','Merge tables by rowname') do
-    options[:merge] = true
-  end
+    o.on('--columns list', Array, 'List of column names or indices') do |l|
+      options[:columns] = l
+    end
-  o.on('--diff list',Array,'Diff two input files on columns (default rownames)') do |l|
-    if l.size==1 and File.exist?(l.first)
-      ARGV.unshift l.first
-      l = ["0"]
+    o.on('--column-filter expression', 'Column name filtering function') do |par|
+      options[:column_filter] = par
     end
-    options[:diff] = l
-  end
-  o.on('--overlap list',Array,'Find overlap of two input files on columns)') do |l|
-    if l.size==1 and File.exist?(l.first)
-      ARGV.unshift l.first
-      l = ["0"]
+    o.on('--merge','Merge tables by rowname') do
+      options[:merge] = true
     end
-    options[:overlap] = l
-  end
-  o.on('--merge','Merge tables by rowname') do
-    options[:merge] = true
-  end
-  o.separator "\n\tOverrides:\n\n"
+    o.on('--diff list',Array,'Diff two input files on columns (default rownames)') do |l|
+      if l.size==1 and File.exist?(l.first)
+        ARGV.unshift l.first
+        l = ["0"]
+      end
+      options[:diff] = l
+    end
-  o.on('--skip lines',Integer,'Skip the first lines before parsing') do |skip|
-    options[:skip] = skip
-  end
+    o.on('--overlap list',Array,'Find overlap of two input files on columns)') do |l|
+      if l.size==1 and File.exist?(l.first)
+        ARGV.unshift l.first
+        l = ["0"]
+      end
+      options[:overlap] = l
+    end
+    o.on('--merge','Merge tables by rowname') do
+      options[:merge] = true
+    end
-  o.on('--with-headers','Include the header element in filtering etc.') do
-    options[:with_headers] = true
-    options[:write_header] = false
-  end
+    o.separator "\n\tOverrides:\n\n"
-  o.on('--with-rownames','Include the rownames in filtering etc.') do
-    options[:with_rownames] = true
-  end
+    o.on('--skip lines',Integer,'Skip the first lines before parsing') do |skip|
+      options[:skip] = skip
+    end
-  o.on('--unshift-headers','Add an extra header element at the front (header contains one fewer field than the number of columns)') do
-    options[:unshift_headers] = true
-  end
+    o.on('--with-headers','Include the header element in filtering etc.') do
+      options[:with_headers] = true
+      options[:write_header] = false
+    end
-  o.on('--strip-quotes','Strip quotes from table fields') do
-    options[:strip_quotes] = true
-  end
+    o.on('--with-rownames','Include the rownames in filtering etc.') do
+      options[:with_rownames] = true
+    end
-  o.separator "\n\tTransform:\n\n"
+    o.on('--unshift-headers','Add an extra header element at the front (header contains one fewer field than the number of columns)') do
+      options[:unshift_headers] = true
+    end
-  o.on('--transform-ids [downcase,upcase]',[:downcase,:upcase],'Transform column and row identifiers') do |par|
-    options[:transform_ids] = par.to_sym
-  end
+    o.on('--keep-headers','Keep original headers on merge') do
+      options[:keep_headers] = true
+    end
-  o.separator "\n\tFormat and options:\n\n"
-  o.on('--in-format [tab,csv,split,regex]', [:tab, :csv, :split, :regex], 'Input format (default tab)') do |par|
-    options[:in_format] = par.to_sym
-  end
-  o.on('--format [tab,csv,rdf,eval]', [:tab, :csv, :rdf, :eval], 'Output format (default tab)') do |par|
-    options[:format] = par.to_sym
-  end
+    o.on('--strip-quotes','Strip quotes from table fields') do
+      options[:strip_quotes] = true
+    end
-  o.on("--split-on command",String,"Split on string or regex (use with --in-format)") do | s |
-    options[:split_on] = s
-  end
+    o.on('--pad-fields','Add empty fields if a row is too short') do
+      options[:pad_fields] = true
+    end
-  o.on("-e command",String,"Evaluate output command (use with --format eval)") do | s |
-    options[:evaluate] = s
-  end
+    o.separator "\n\tTransform:\n\n"
-  o.on("--fasta regex",String,"Read FASTA format creating ID with regex") do | regex |
-    options[:fasta] = regex
-  end
+    o.on('--transform-ids [downcase,upcase]',[:downcase,:upcase],'Transform column and row identifiers') do |par|
+      options[:transform_ids] = par.to_sym
+    end
-  o.on('--blank-nodes','Output (RDF) blank nodes - allowing for duplicate row names') do
-    options[:blank_nodes] = true
-  end
+    o.separator "\n\tFormat and options:\n\n"
+    o.on('--in-format [tab,csv,split,regex]', [:tab, :csv, :split, :regex], 'Input format (default tab)') do |par|
+      options[:in_format] = par.to_sym
+    end
-  o.on('--statistics','Output column statistics') do
-    options[:statistics] = true
-  end
-  o.separator "\n\tVerbosity:\n\n"
-  o.on("--logger filename",String,"Log to file (default stderr)") do | name |
-    Bio::Log::CLI.logger(name)
-  end
+    o.on('--format [tab,csv,rdf,eval]', [:tab, :csv, :rdf, :eval], 'Output format (default tab)') do |par|
+      options[:format] = par.to_sym
+    end
-  o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
-    Bio::Log::CLI.trace(s)
-  end
-  o.on("-q", "--quiet", "Run quietly") do |q|
-    Bio::Log::CLI.trace('error')
-  end
-  o.on("-v", "--verbose", "Run verbosely") do |v|
-    Bio::Log::CLI.trace('info')
-  end
-  o.on("--debug", "Show debug messages") do |v|
-    Bio::Log::CLI.trace('debug')
-  end
+    o.on("--split-on command",String,"Split on string or regex (use with --in-format)") do | s |
+      options[:split_on] = s
+    end
-  o.separator ""
+    o.on("-e command",String,"Evaluate output command (use with --format eval)") do | s |
+      options[:evaluate] = s
+    end
-  o.on_tail('-h', '--help', 'Display this help and exit') do
-    options[:show_help] = true
-  end
-end
+    o.on("--fasta regex",String,"Read FASTA format creating ID with regex") do | regex |
+      options[:fasta] = regex
+    end
-begin
-  opts.parse!(ARGV)
+    o.on('--blank-nodes','Output (RDF) blank nodes - allowing for duplicate row names') do
+      options[:blank_nodes] = true
+    end
-  if options[:show_help]
-    print opts
-    print USAGE
-  end
+    o.on('--statistics','Output column statistics') do
+      options[:statistics] = true
+    end
+    o.separator "\n\tVerbosity:\n\n"
+    o.on("--logger filename",String,"Log to file (default stderr)") do | name |
+      Bio::Log::CLI.logger(name)
+    end
-  # TODO: your code here
-  # use options for your logic
-rescue OptionParser::InvalidOption => e
-  options[:invalid_argument] = e.message
-end
+    o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
+      Bio::Log::CLI.trace(s)
+    end
+    o.on("-q", "--quiet", "Run quietly") do |q|
+      Bio::Log::CLI.trace('error')
+    end
+    o.on("-v", "--verbose", "Run verbosely") do |v|
+      Bio::Log::CLI.trace('info')
+    end
+    o.on("--debug", "Show debug messages") do |v|
+      Bio::Log::CLI.trace('debug')
+      options[:debug] = true
+    end
-Bio::Log::CLI.configure('bio-table')
-logger = Bio::Log::LoggerPlus['bio-table']
-logger.info [options]
+    o.separator ""
-include BioTable
+    o.on_tail('-h', '--help', 'Display this help and exit') do
+      options[:show_help] = true
+    end
+  end
-if options[:diff]
-  logger.warn "Column settings are ignored for --diff" if options[:columns]
-  logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
-  t1 = TableReader::read_file(ARGV[0], options)
-  t2 = TableReader::read_file(ARGV[1], options)
-  t = Diff::diff_tables(t1,t2, options)
-  t.write(options)
-  exit
-end
+  begin
+    opts.parse!(ARGV)
-if options[:overlap]
-  logger.warn "Column settings are ignored for --overlap" if options[:columns]
-  logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
-  t1 = TableReader::read_file(ARGV[0], options)
-  t2 = TableReader::read_file(ARGV[1], options)
-  t = Overlap::overlap_tables(t1,t2, options)
-  t.write(options)
-  exit
-end
+    if options[:show_help]
+      print opts
+      print USAGE
+    end
-if options[:fasta]
-  logger.warn "Column settings are ignored for --fasta" if options[:columns]
-  ARGV.each do | fn |
-    print "id\tseq\n"
-    FastaReader.new(fn,options[:fasta]).each do | rec |
-      print rec.id,"\t",rec.seq,"\n"
+    # TODO: your code here
+    # use options for your logic
+  rescue OptionParser::InvalidOption => e
+    options[:invalid_argument] = e.message
+  end
+  Bio::Log::CLI.configure('bio-table')
+  logger = Bio::Log::LoggerPlus['bio-table']
+  logger.info [options]
+  include BioTable
+  if options[:diff]
+    logger.warn "Column settings are ignored for --diff" if options[:columns]
+    logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
+    t1 = TableReader::read_file(ARGV[0], options)
+    t2 = TableReader::read_file(ARGV[1], options)
+    t = Diff::diff_tables(t1,t2, options)
+    t.write(options)
+    exit
+  end
+  if options[:overlap]
+    logger.warn "Column settings are ignored for --overlap" if options[:columns]
+    logger.warn "Ignoring extraneaous files "+ARGV[2..-1].join(",") if ARGV.size>2
+    t1 = TableReader::read_file(ARGV[0], options)
+    t2 = TableReader::read_file(ARGV[1], options)
+    t = Overlap::overlap_tables(t1,t2, options)
+    t.write(options)
+    exit
+  end
+  if options[:fasta]
+    logger.warn "Column settings are ignored for --fasta" if options[:columns]
+    ARGV.each do | fn |
+      print "id\tseq\n"
+      FastaReader.new(fn,options[:fasta]).each do | rec |
+        print rec.id,"\t",rec.seq,"\n"
+      end
     end
+    exit
   end
-  exit
-end
-if options[:merge]
-  ts = []
-  ARGV.each do | fn |
-    ts << TableReader::read_file(fn, options)
+  if options[:merge]
+    ts = []
+    ARGV.each do | fn |
+      ts << TableReader::read_file(fn, options)
+    end
+    t = Merge::merge_tables(ts, options)
+    t.write(options)
+    exit
   end
-  t = Merge::merge_tables(ts, options)
-  t.write(options)
-  exit
-end
-#
-# We also support STDIN for the first 'file'. A non-blocking idea can
-# be found here:
-#
-# http://eric.lubow.org/2010/ruby/multiple-input-locations-from-bash-into-ruby/
-#
+  #
+  # We also support STDIN for the first 'file'. A non-blocking idea can
+  # be found here:
+  #
+  # http://eric.lubow.org/2010/ruby/multiple-input-locations-from-bash-into-ruby/
+  #
-writer =
-  if options[:format] == :rdf
-    BioTable::RDF::Writer.new(options[:blank_nodes])
-  else
-    BioTable::TableWriter::Writer.new(options[:format],options[:evaluate])
-  end
+  writer =
+    if options[:format] == :rdf
+      BioTable::RDF::Writer.new(options[:blank_nodes])
+    else
+      BioTable::TableWriter::Writer.new(options[:format],options[:evaluate])
+    end
+  statistics = if options[:statistics]
+                 BioTable::Statistics::Accumulate.new
+               end
+  count      = if options[:count]
+                 BioTable::Count::CountTracker.new(options[:count])
+               end
-if INPUT_ON_STDIN
   opts = options.dup # so we can 'safely' modify options
   has_input = false
-  BioTable::TableLoader.emit(STDIN, opts).each do |row, type|
-    writer.write(TableRow.new(row[0],row[1..-1]),type)
-    has_input = true
-  end
-  options[:write_header] = false if has_input  # don't write the header for chained files
-end
-statistics = if options[:statistics]
-               BioTable::Statistics::Accumulate.new
-             else
-               nil
-             end
+  walk_table = lambda { |f|
+    BioTable::TableLoader.emit(f, opts).each do |row, type| # type is :header or :row
+      if statistics
+        statistics.add(row,type)
+      else
+        row = count.add(row,type) if count # merge and count
+        writer.write(TableRow.new(row[0],row[1..-1]),type) if row
+      end
+      has_input = true
+    end
+    if count
+      row = count.add(row,:row,flush: true)
+      writer.write(TableRow.new(row[0],row[1..-1]),:row) if row
+    end
+    options[:write_header] = false if has_input  # don't write the header for chained files
+  }
-ARGV.each do | fn |
-  opts = options.dup # so we can 'safely' modify options
-  f = File.open(fn,"r")
-  if not opts[:in_format] and fn =~ /\.csv$/
-    logger.debug "Autodetected CSV file"
-    opts[:in_format] = :csv
+  if INPUT_ON_STDIN
+    walk_table.call(STDIN)
   end
-  BioTable::TableLoader.emit(f, opts).each do |row,type|
-    if statistics
-      statistics.add(row,type)
-    else
-      writer.write(TableRow.new(row[0],row[1..-1]),type)
+  ARGV.each do | fn |
+    opts = options.dup # so we can 'safely' modify options
+    f = File.open(fn,"r")
+    if not opts[:in_format] and fn =~ /\.csv$/i
+      logger.debug "Autodetected CSV file"
+      opts[:in_format] = :csv
     end
+    walk_table.call(f)
   end
-  options[:write_header] = false  # don't write the header for chained files
-end
-statistics.write(writer) if statistics
+  statistics.write(writer) if statistics
+rescue => msg
+  if options[:debug]
+    raise
+  else
+    $stderr.print "Error: ",msg
+    exit 1
+  end
+end

data/lib/bio-table.rb CHANGED

@@ -30,5 +30,6 @@ require 'bio-table/parsers/fastareader.rb'
 module BioTable
   autoload :Statistics,'bio-table/statistics'
+  autoload :Count,'bio-table/count'
 end

data/lib/bio-table/count.rb ADDED

@@ -0,0 +1,39 @@
+module BioTable
+  module Count
+    # Track rows that have the same column items. Return the last match of the cumulative list
+    # with the count attached.
+    class CountTracker
+      def initialize list
+        @list = list.map { |item| item.to_i }
+        @rows = []
+      end
+      # Add a row and if it differs send the last merged edition back
+      # type is :header or :row
+      def add row, type, flush: false
+        return row+["count"] if type == :header
+        num = @rows.size
+        prev = @rows.last
+        if flush
+          prev+[num]
+        else
+          # Take the list and compare each item to the previous row
+          prev_same = if prev
+                        @list.reduce(true) { |memo,i| memo && (row[i]==prev[i]) }
+                      else
+                        false
+                      end
+          if prev_same
+            @rows << row
+          else
+            @rows = []
+            @rows << row
+            return prev+[num] if prev
+          end
+          nil
+        end
+      end
+    end
+  end
+end

data/lib/bio-table/filter.rb CHANGED

@@ -17,6 +17,7 @@ module BioTable
         field = @fields[index]
         @values[index] = (Filter::valid_number?(field) ? field.to_f : nil )
       end
+      @values[index].freeze
       @values[index]
     end

data/lib/bio-table/merge.rb CHANGED

@@ -6,7 +6,9 @@ module BioTable
       logger = Bio::Log::LoggerPlus['bio-table']
       logger.info("Merging tables")
       headers = tables.first.header[0..0] +
-        tables.map { |t| t.header[1..-1].map{|n| t.name+'-'+n} }.flatten
+        tables.map { |t| t.header[1..-1].map{|n|
+          (options[:keep_headers] ? n : t.name+'-'+n)
+        }}.flatten
       t = Table.new(headers)
       # index tables on rownames
       idxs = []

data/lib/bio-table/parser.rb CHANGED

@@ -7,21 +7,23 @@ module BioTable
     # Converts a string into an array of string fields
     def LineParser::parse(line, in_format, split_on)
       if in_format == :csv
-        CSV.parse(line)[0]
+        $stderr.print "WARNING: this looks like a tab delimited file to me!\n" if line =~ /\t/
+        CSV.parse_line(line)
       elsif in_format == :split
-        line.strip.split(split_on).map { |field|
+        line.split(split_on).map { |field|
           fld = field.strip
           fld = nil if fld == "NA"
           fld
         }
       elsif in_format == :regex
-        line.strip.split(/#{split_on}/).map { |field|
+        line.split(/#{split_on}/).map { |field|
           fld = field.strip
           fld = nil if fld == "NA"
           fld
         }
       else
-        line.strip.split("\t").map { |field|
+        $stderr.print "WARNING: this looks like a tab delimited file to me!\n" if line =~ /,"/
+        line.split("\t").map { |field|
           fld = field.strip
           fld = nil if fld == "NA"
           fld

data/lib/bio-table/rewrite.rb CHANGED

@@ -4,9 +4,10 @@ module BioTable
     # Rewrite fields. Both field and fields can be used, but not at the same time.
     def Rewrite::rewrite code, rowname, field
-      fields = field
-      original = field
+      fields = field.dup
+      original = field.dup
       values = LazyValues.new(field)
+      value = values
       return rowname,field if not code or code==""
       begin
         eval(code)
@@ -14,6 +15,12 @@ module BioTable
         $stderr.print "Failed to evaluate ",rowname," ",field," with ",code,"\n"
         raise
       end
+      if (fields & original != fields.uniq) and (field & original != field.uniq)
+        $stderr.print [:original,original],"\n"
+        $stderr.print [:fields,fields],"\n"
+        $stderr.print [:field,field],"\n"
+        raise "You can not rewrite both field and fields!"
+      end
       field = fields if fields != original
       return rowname,field
     end

data/lib/bio-table/table_apply.rb CHANGED

@@ -51,9 +51,13 @@ module BioTable
       return column_idx, new_header
     end
+    # Take a line as a string and return it as a tuple of rowname and datafields
     def parse_row(line_num, line, header, column_idx, prev_fields, options)
       fields = LineParser::parse(line, options[:in_format], options[:split_on])
       return nil,nil if fields.compact == []
+      if options[:pad_fields] and fields.size < header.size
+        fields += [''] * (header.size - fields.size)
+      end
       fields = Formatter::strip_quotes(fields) if @strip_quotes
       fields = Formatter::transform_row_ids(@transform_ids, fields) if @transform_ids
       fields = Filter::apply_column_filter(fields,column_idx)

data/lib/bio-table/tableload.rb CHANGED

@@ -9,7 +9,7 @@ module BioTable
     #
     # Note that you need to pass in :with_header to get the header row
     def TableLoader::emit generator, options = {}
-      table_apply = TableApply.new(options)
+      table_apply = TableApply.new(options) # parser and filters
       column_index = nil, prev_line = nil
       skip = options[:skip]
       skip = 0 if skip == nil
@@ -19,6 +19,7 @@ module BioTable
         generator.each_with_index do |line, line_num|
           # p [line_num, line]
           if line_num-skip == 0
+            # ---- This is the header section
             header = table_apply.parse_header(line, options)
             # Validator::valid_header?(header, @header)  # compare against older header when merging
             column_index,header = table_apply.column_index(header) # we may rewrite the header

data/lib/bio-table/validator.rb CHANGED

@@ -4,8 +4,8 @@ module BioTable
     def Validator::valid_header? header, old_header
       if old_header
         if header - old_header != []
-          p old_header
-          p header
+          $stderr.print old_header,"\n"
+          $stderr.print header,"\n"
           raise "Headers do not compare!"
         end
       end
@@ -15,8 +15,8 @@ module BioTable
     def Validator::valid_row? line_number, fields, last_fields
       return false if fields == nil or fields.size == 0
       if last_fields and last_fields.size>0 and (fields.size != last_fields.size)
-        p last_fields
-        p fields
+        $stderr.print last_fields,"\n"
+        $stderr.print fields,"\n"
         throw "Number of fields diverge in line #{line_number} (size #{fields.size}, expected #{last_fields.size})"
       end
       true

data/test/data/regression/table1-STDIN.ref CHANGED

@@ -378,6 +378,7 @@
 110173,9.97,18.59,12.35,13.67,14.56,14.63,12.69,18.49,14.23,16.23,,,20.48,16.47,20.68,13.14,18.88,14.3,13.67,20.54,15.99,16.15,21.33,17.06,,16.05,,,Manba,"mannosidase, beta A, lysosomal,mannosidase, beta A, lysosomal,mannosidase, beta A, lysosomal,"
 110187,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0.31,0,0,0,,0,,,Abpg,"androgen binding protein gamma"
 110196,11.21,27.98,12.13,16.57,16.77,12.76,14.31,13.07,9.22,10.51,,,29.03,16.7,7.62,24.09,15.31,28.3,10.92,14.73,23.27,15.13,36.77,10.05,,15.15,,,Fdps,"farnesyl diphosphate synthetase"
 0	0.06	NA	0	0	0	0	0.11	0	0	0	NA	NA	0	0	0	0	0	0	0	0	0	0	0	0	NA	0	NA	NA	Mal2	MAL2 proteolipid protein
 213.15	236.88	213.95	213.15	253.49	198	231.56	200.96	255.2	214.04	231.46	NA	NA	233.23	241.26	237.53	171.87	237.13	162.3	252.13	284.85	188.76	253.43	220.15	305.52	NA	217.42	NA	NA	Nckap1l	NCK associated protein 1 like,NCK associated protein 1 like,
 0	0	NA	0	0	0.07	0.04	0	0	0	0	NA	NA	0.02	0	0	0	0	0	0.06	0	0	0	0.02	0	NA	0	NA	NA	Csdc2	RNA-binding protein pippin

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bio-table
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 1.0.0
 platform: ruby
 authors:
 - Pjotr Prins
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-02-27 00:00:00.000000000 Z
+date: 2014-05-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bio-logger
@@ -80,20 +80,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 2.0.0
-- !ruby/object:Gem::Dependency
-  name: bio
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.4.2
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.4.2
 - !ruby/object:Gem::Dependency
   name: rdoc
   requirement: !ruby/object:Gem::Requirement
@@ -153,6 +139,7 @@ files:
 - features/support/env.rb
 - lib/bio-table.rb
 - lib/bio-table/columns.rb
+- lib/bio-table/count.rb
 - lib/bio-table/diff.rb
 - lib/bio-table/filter.rb
 - lib/bio-table/formatter.rb