wukong 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ dir_to_rename = ARGV[0]
4
+ dest_ext = '.tsv'
5
+
6
+ unless dir_to_rename && (! dir_to_rename.empty?)
7
+ warn "Need a directory or file spec to rename."
8
+ exit
9
+ end
10
+
11
+ #
12
+ # Setup
13
+ #
14
+ warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"
15
+
16
+ #
17
+ # Examine the files
18
+ #
19
+ file_listings = `hdp-ls #{dir_to_rename}`.split("\n")
20
+ command_lists = { }
21
+ file_listings[1..-1].each do |file_listing|
22
+ m = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}.match(file_listing)
23
+ if !m then warn "Couldn't grok #{file_listing}" ; next ; end
24
+ size, filename = m.captures
25
+ case
26
+ when size.to_i == 0 then (command_lists[:deletes]||=[]) << filename
27
+ else
28
+ firstline = `hdp-cat #{filename} | head -qn1 `
29
+ file_key, _ = firstline.split("\t", 2)
30
+ unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
31
+ warn "Don't want to rename to '#{file_key}'... skipping"
32
+ next
33
+ end
34
+ dirname = File.dirname(filename)
35
+ destfile = File.join(dirname, file_key)+dest_ext
36
+ (command_lists[:moves]||=[]) << "hdp-mv #{filename} #{destfile}"
37
+ end
38
+ end
39
+
40
+ #
41
+ # Execute the command_lists
42
+ #
43
+ command_lists.each do |type, command_list|
44
+ case type
45
+ when :deletes
46
+ command = "hdp-rm #{command_list.join(" ")}"
47
+ puts command
48
+ `#{command}`
49
+ when :moves
50
+ command_list.each do |command|
51
+ puts command
52
+ `#{command}`
53
+ end
54
+ end
55
+ end
56
+
57
+
58
+ # -rw-r--r-- 3 flip supergroup 0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010
59
+
60
+ # # Killing empty files
61
+ # find . -size 0 -print -exec rm {} \;
62
+ #
63
+ # for foo in part-0* ; do
64
+ # newname=`
65
+ # head -n1 $foo |
66
+ # cut -d' ' -f1 |
67
+ # ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
68
+ # `.tsv ;
69
+ # echo "moving $foo to $newname"
70
+ # mv "$foo" "$newname"
71
+ # done
72
+ #
73
+ # # dir=`basename $PWD`
74
+ # # for foo in *.tsv ; do
75
+ # # echo "Compressing $dir"
76
+ # # bzip2 -c $foo > ../$dir-bz2/$foo.bz2
77
+ # # done
data/bin/hdp-ps ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ hadoop job -list all
data/bin/hdp-put ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ hadoop dfs -put "$1" "$2"
data/bin/hdp-rm ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env bash
2
+
3
+ if [ "$1" == "-r" ] ; then
4
+ shift
5
+ action=rmr
6
+ else
7
+ action=rm
8
+ fi
9
+ echo hadoop dfs -$action "$@"
10
+ # read -p "Hit ctrl-C to abort or enter to do this...."
11
+ hadoop dfs -$action "$@"
data/bin/hdp-sort ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env bash
2
+ # hadoop dfs -rmr out/parsed-followers
3
+
4
+ input_file=${1} ; shift
5
+ output_file=${1} ; shift
6
+ map_script=${1-/bin/cat} ; shift
7
+ reduce_script=${1-/usr/bin/uniq} ; shift
8
+ fields=${1-2} ; shift
9
+
10
+ if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi
11
+
12
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
13
+
14
+ ${HADOOP_HOME}/bin/hadoop \
15
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
16
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
17
+ -jobconf map.output.key.field.separator='\t' \
18
+ -jobconf num.key.fields.for.partition=1 \
19
+ -jobconf stream.map.output.field.separator='\t' \
20
+ -jobconf stream.num.map.output.key.fields="$fields" \
21
+ -mapper "$map_script" \
22
+ -reducer "$reduce_script" \
23
+ -input "$input_file" \
24
+ -output "$output_file" \
25
+ "$@"
26
+
27
+
28
+ # -jobconf mapred.map.tasks=3 \
29
+ # -jobconf mapred.reduce.tasks=3 \
data/bin/hdp-stream ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env bash
2
+ # hadoop dfs -rmr out/parsed-followers
3
+
4
+ input_file=${1} ; shift
5
+ output_file=${1} ; shift
6
+ map_script=${1-/bin/cat} ; shift
7
+ reduce_script=${1-/usr/bin/uniq} ; shift
8
+ fields=${1-2} ; shift
9
+
10
+ if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi
11
+
12
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
13
+
14
+ ${HADOOP_HOME}/bin/hadoop \
15
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
16
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
17
+ -jobconf map.output.key.field.separator='\t' \
18
+ -jobconf num.key.fields.for.partition=1 \
19
+ -jobconf stream.map.output.field.separator='\t' \
20
+ -jobconf stream.num.map.output.key.fields="$fields" \
21
+ -mapper "$map_script" \
22
+ -reducer "$reduce_script" \
23
+ -input "$input_file" \
24
+ -output "$output_file" \
25
+ "$@"
26
+
27
+
28
+ # -jobconf mapred.map.tasks=3 \
29
+ # -jobconf mapred.reduce.tasks=3 \
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env bash
2
+
3
+ input_file=${1} ; shift
4
+ output_file=${1} ; shift
5
+ map_script=${1-/bin/cat} ; shift
6
+ reduce_script=${1-/usr/bin/uniq} ; shift
7
+
8
+ if [ "$reduce_script" == "" ] ; then echo "$0 input_file output_file [sort_fields] [mapper] [reducer] [args]" ; exit ; fi
9
+
10
+ hadoop jar /home/flip/hadoop/h/contrib/streaming/hadoop-*-streaming.jar \
11
+ -mapper "$map_script" \
12
+ -reducer "$reduce_script" \
13
+ -input "$input_file" \
14
+ -output "$output_file" \
15
+ "$@"
16
+
17
+ # -jobconf mapred.map.tasks=3 \
18
+ # -jobconf mapred.reduce.tasks=3 \
data/bin/hdp-sync ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+ require 'wukong'
3
+
4
+ src_dir, dest_dir = ARGV[0..1]
5
+ src_files = Dir[src_dir + '/*']
6
+ dest_files = Wukong::Dfs.list_files dest_dir
7
+ Wukong::Dfs.compare_listings(src_files, dest_files) do |comparison, src_file, dest_file|
8
+ case comparison
9
+ when :missing
10
+ dest_filename = "%s/%s" % [dest_dir, dest_file]
11
+ puts "Copying #{src_file} #{dest_filename}"
12
+ puts `hadoop dfs -put #{src_file} #{dest_filename}`
13
+ when :differ
14
+ src_ls = `ls -l #{src_file}`.split(/\s+/).join("\t")
15
+ puts "Differ: #{src_ls} \n#{dest_file}"
16
+ end
17
+ end
data/bin/hdp-wc ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env ruby
2
+ require 'wukong'
3
+ NEWLINE_LENGTH = $/.length # KLUDGE
4
+
5
+ #
6
+ #
7
+ #
8
+ # !! The +words+ count comes out higher than that of +wc+ -- don't know
9
+ # why. (It's close: a 10GB, 1M line dataset it showed 367833839 vs. 367713271)
10
+ #
11
+ class WcMapper < Wukong::Streamer::LineStreamer
12
+ attr_accessor :lines, :fields, :words, :chars, :bytes
13
+
14
+ def before_stream
15
+ self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
16
+ end
17
+
18
+ def process line
19
+ return unless line
20
+ self.lines += 1
21
+ self.fields += 1 + line.count("\t")
22
+ self.words += 1 + line.strip.scan(/\s+/).length unless line.blank?
23
+ self.chars += line.chars.to_a.length + NEWLINE_LENGTH
24
+ self.bytes += line.bytesize + NEWLINE_LENGTH
25
+ $stderr.puts line if (line.chars.to_a.length != line.bytesize)
26
+ end
27
+
28
+ def after_stream
29
+ emit [lines, fields, words, chars, bytes]
30
+ end
31
+ end
32
+
33
+ #
34
+ #
35
+ class WcReducer < Wukong::Streamer::Base
36
+ attr_accessor :lines, :fields, :words, :chars, :bytes
37
+
38
+ def before_stream
39
+ self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
40
+ end
41
+
42
+ def process m_lines, m_fields, m_words, m_chars, m_bytes
43
+ self.lines += m_lines.to_i
44
+ self.fields += m_fields.to_i
45
+ self.words += m_words.to_i
46
+ self.chars += m_chars.to_i
47
+ self.bytes += m_bytes.to_i
48
+ end
49
+
50
+ def after_stream
51
+ emit [lines, fields, words, chars, bytes]
52
+ end
53
+ end
54
+
55
+ Wukong::Script.new(WcMapper, WcReducer, :reduce_tasks => 1).run
56
+
57
+ # class FooScript < Wukong::Script
58
+ # def map_command
59
+ # '/usr/bin/wc'
60
+ # end
61
+ # def reduce_command
62
+ # '/bin/cat'
63
+ # end
64
+ # end
65
+ # FooScript.new(nil, nil, :reduce_tasks => 1).run
66
+ #
67
+ # ruby -ne 'wc_v = `echo "#{$_.chomp}" | wc`; gr_v=($_.strip.empty? ? 0 : $_.strip.scan(/\s+/).length + 1 ) ; puts [wc_v.chomp, " ", gr_v, $_.chomp].join("\t")'
data/bin/md5sort ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env python
2
+ """ sorts lines (or tab-sep records) by md5. (e.g. for train/test splits).
3
+ optionally prepends with the md5 id too.
4
+ brendan o'connor - anyall.org - gist.github.com/brendano """
5
+
6
+ import hashlib,sys,optparse
7
+ p = optparse.OptionParser()
8
+ p.add_option('-k', type='int', default=False)
9
+ p.add_option('-p', action='store_true')
10
+ opts,args=p.parse_args()
11
+
12
+ lines = sys.stdin.readlines()
13
+ getter=lambda s: hashlib.md5(s[:-1]).hexdigest()
14
+ if opts.k:
15
+ getter=lambda s: hashlib.md5(s[:-1].split("\t")[opts.k-1]).hexdigest()
16
+
17
+ lines.sort(key=lambda s: getter(s))
18
+ for line in lines:
19
+ if opts.p: line = getter(line) + "\t" + line
20
+ print line,
data/bin/tabchar ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env bash
2
+ # insert a tab char from the command line:
3
+ # echo "hi$(tabchar)there"
4
+ # # => "hi there"
5
+ echo -n -e '\t'
data/bin/uniqc ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ uniq -c | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ "%15s\t" % $1 }'
data/bin/wu-hist ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ sort | uniq -c | sort -rn | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ $1+"\t" }'
data/bin/wu-lign ADDED
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ USAGE= %Q{
4
+ # h1. wulign -- format a tab-separated file as aligned columns
5
+ #
6
+ # wulign will intelligently reformat a tab-separated file into a tab-separated,
7
+ # space aligned file that is still suitable for further processing. For example,
8
+ # given the log-file input
9
+ #
10
+ # <pre><code>
11
+ # 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
12
+ # 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
13
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
14
+ # 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
15
+ # 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
16
+ # 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
17
+ # 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
18
+ # </code></pre>
19
+ #
20
+ # wulign will reformat it to read
21
+ #
22
+ # <pre><code>
23
+ # 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
24
+ # 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
25
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
26
+ # 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
27
+ # 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
28
+ # 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
29
+ # 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
30
+ # </code></pre>
31
+ #
32
+ # The fields are still tab-delimited by exactly one tab -- only spaces are used to
33
+ # pad out fields. You can still use cuttab and friends to manipulate columns.
34
+ #
35
+ # wulign isn't intended to be smart, or correct, or reliable -- only to be
36
+ # useful for previewing and organizing tab-formatted files. In general
37
+ # @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically
38
+ # equivalent to its input. (That is, the only changes should be insertion of
39
+ # spaces and re-formatting of numerics.) But still -- reserve its use for human
40
+ # inspection only.
41
+ #
42
+ # (Note: tab characters in this source code file have been converted to spaces;
43
+ # replace whitespace with tab in the first example if you'd like to play along at
44
+ # home.)
45
+ #
46
+ # h2. How it works
47
+ #
48
+ # Wulign takes the first 1000 lines, splits by TAB characters into fields, and
49
+ # tries to guess the format -- int, float, or string -- for each. It builds a
50
+ # consensus of the width and type for corresponding columns in the chunk. If a
51
+ # column has mixed numeric and string formats it degrades to :mixed, which is
52
+ # basically treated as :string. If a column has mixed :float and :int elements all
53
+ # of them are formatted as float.
54
+ #
55
+ # h2. Command-line arguments
56
+ #
57
+ # You can give sprintf-style positional arguments on the command line that will be
58
+ # applied to the corresponding columns. (Blank args are used for placeholding and
59
+ # auto-formatting is still applied). So with the example above,
60
+ #
61
+ # @cat foo | wulign '' '' '' '%8.4e'@
62
+ #
63
+ # will format the fourth column with "%8.4e", while the first three columns and
64
+ # fifth-and-higher columns are formatted as usual.
65
+ #
66
+ # <pre><code>
67
+ # ...
68
+ # 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
69
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
70
+ # 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
71
+ # ....
72
+ # </code></pre>
73
+ #
74
+ # h2. Notes
75
+ #
76
+ # * It has no knowledge of header rows. An all-text first line will screw everything up.
77
+ #
78
+ # * It also requires a unanimous vote. One screwy line can coerce the whole mess
79
+ # to :mixed; width formatting will still be applied, though.
80
+ #
81
+ # * It won't set columns wider than 70 chars -- this allows for the occasional
82
+ # super-wide column without completely breaking your screen.
83
+ #
84
+ # * For :float values, wulign tries to guess at the right number of significant
85
+ # digits to the left and right of the decimal point.
86
+ #
87
+ # * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
88
+ # or escaping; every tab delimits a field, every newline a record.
89
+ }
90
+
91
+ if ARGV[0] == '--help'
92
+ puts $0
93
+ puts USAGE
94
+ exit
95
+ end
96
+
97
+ #
98
+ # How many initial lines to use to guess formatting. Lines after this are
99
+ # simply reformatted according to the consensus of the initial
100
+ # FORMAT_GUESSING_LINES.
101
+ #
102
+ FORMAT_GUESSING_LINES = 500
103
+ # widest column to set
104
+ MAX_MAX_WIDTH = 70
105
+
106
+ INT_RE = /\A\d+\z/
107
+ FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/
108
+
109
+ def consensus_type val, alltype
110
+ return :mixed if alltype == :mixed
111
+ case
112
+ when val == '' then type = nil
113
+ when val =~ INT_RE then type = :int
114
+ when val =~ FLOAT_RE then type = :float
115
+ else type = :str end
116
+ return if ! type
117
+ case
118
+ when alltype.nil? then type
119
+ when alltype == type then type
120
+ when ( ((alltype==:float) && (type == :int)) || ((alltype == :int) && (type == :float)) )
121
+ :float
122
+ else :mixed
123
+ end
124
+ end
125
+
126
+ def f_width str
127
+ str =~ FLOAT_RE or return 0
128
+ [$1.length, $2 ? $2.length : 0]
129
+ end
130
+
131
+ maxw = []
132
+ col_types = []
133
+ col_minmag = []
134
+ col_maxmag = []
135
+ rows = []
136
+ skip_col = []
137
+ ARGV.each_with_index{|v,i| next if (v == '') ; maxw[i] = 0; skip_col[i] = true }
138
+ FORMAT_GUESSING_LINES.times do
139
+ line = $stdin.readline rescue nil
140
+ break unless line
141
+ cols = line.chomp.split("\t").map{|s| s.strip }
142
+ col_widths = cols.map{|col| col.length }
143
+ col_widths.each_with_index{|cw,i| maxw[i] = [[cw,maxw[i]].compact.max, MAX_MAX_WIDTH].min }
144
+ cols.each_with_index{|col,i|
145
+ next if skip_col[i]
146
+ col_types[i] = consensus_type(col, col_types[i])
147
+ if col_types[i] == :float
148
+ mantissa, radix = f_width(col)
149
+ col_minmag[i] = [radix, col_minmag[i], 1].compact.max
150
+ col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
151
+ end
152
+ }
153
+ # p [maxw, col_types, col_minmag, col_maxmag, col_widths, cols]
154
+ rows << cols
155
+ end
156
+
157
+ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
158
+ next(lambda{|s| default % s rescue s }) if default.to_s != ''
159
+ case type
160
+ when :mixed, nil then lambda{|s| "%-#{width}s" % s }
161
+ when :str then lambda{|s| "%-#{width}s" % s }
162
+ when :int then lambda{|s| "%#{width}d" % s.to_i }
163
+ when :float then lambda{|s| "%#{maxmag+minmag+1}.#{minmag}f" % s.to_f }
164
+ else raise "oops type #{type}" end
165
+ end
166
+ # p [maxw, col_types, col_minmag, col_maxmag, format]
167
+
168
+ pad = [''] * maxw.length
169
+ rows.each do |row|
170
+ # note -- strips trailing columns
171
+ puts row.zip(format).map{|c,f| f.call(c) }.join("\t")
172
+ end
173
+ $stdin.each do |line|
174
+ cols = line.chomp.split("\t").map{|s| s.strip }
175
+ # note -- strips trailing columns
176
+ puts cols.zip(format).map{|c,f| f.call(c) rescue c }.join("\t")
177
+ end