RubyGems - ul-wukong - Versions diffs - 4.1.0 - Mend

ul-wukong 4.1.0

Files changed (261) hide show

checksums.yaml +15 -0
data/.gitignore +60 -0
data/.gitmodules +6 -0
data/.rspec +2 -0
data/.travis.yml +19 -0
data/.yardopts +6 -0
data/CHANGELOG.md +7 -0
data/Gemfile +17 -0
data/Guardfile +12 -0
data/LICENSE.md +95 -0
data/NOTES-travis.md +31 -0
data/README-old.md +422 -0
data/README.md +1308 -0
data/Rakefile +28 -0
data/TODO.md +99 -0
data/bin/cutc +30 -0
data/bin/cuttab +5 -0
data/bin/greptrue +6 -0
data/bin/md5sort +20 -0
data/bin/setcat +11 -0
data/bin/tabchar +5 -0
data/bin/uniq-ord +59 -0
data/bin/uniqc +3 -0
data/bin/wu +34 -0
data/bin/wu-clean-encoding +31 -0
data/bin/wu-date +13 -0
data/bin/wu-datetime +13 -0
data/bin/wu-hist +3 -0
data/bin/wu-lign +186 -0
data/bin/wu-local +4 -0
data/bin/wu-plus +9 -0
data/bin/wu-source +5 -0
data/bin/wu-sum +31 -0
data/diagrams/wu_local.dot +39 -0
data/diagrams/wu_local.dot.png +0 -0
data/examples/Gemfile +38 -0
data/examples/README.md +9 -0
data/examples/basic/string_reverser.rb +23 -0
data/examples/basic/tiny_count.rb +8 -0
data/examples/basic/word_count/accumulator.rb +26 -0
data/examples/basic/word_count/tokenizer.rb +13 -0
data/examples/basic/word_count/word_count.rb +6 -0
data/examples/dataflow/scraper_macro_flow.rb +28 -0
data/examples/deploy_pack/Gemfile +6 -0
data/examples/deploy_pack/README.md +6 -0
data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
data/examples/deploy_pack/config/environment.rb +1 -0
data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
data/examples/dsl/dataflow/simple.rb +12 -0
data/examples/dsl/dataflow/telegram.rb +45 -0
data/examples/dsl/workflow/cherry_pie.dot +97 -0
data/examples/dsl/workflow/cherry_pie.md +104 -0
data/examples/dsl/workflow/cherry_pie.png +0 -0
data/examples/dsl/workflow/cherry_pie.rb +101 -0
data/examples/empty/.gitkeep +0 -0
data/examples/examples_helper.rb +9 -0
data/examples/geo.rb +4 -0
data/examples/geo/geo_grids.numbers +0 -0
data/examples/geo/geolocated.rb +331 -0
data/examples/geo/quadtile.rb +69 -0
data/examples/geo/spec/geolocated_spec.rb +247 -0
data/examples/geo/tile_fetcher.rb +77 -0
data/examples/graph/implied_geolocation/README.md +63 -0
data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
data/examples/improver/tweet_summary.rb +73 -0
data/examples/loadable.rb +2 -0
data/examples/munging/airline_flights/airline_flights.rake +83 -0
data/examples/munging/airline_flights/airplane.rb +0 -0
data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
data/examples/munging/airline_flights/indexable.rb +75 -0
data/examples/munging/airline_flights/indexable_spec.rb +90 -0
data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
data/examples/munging/airline_flights/tasks.rake +83 -0
data/examples/munging/airline_flights/topcities.rb +167 -0
data/examples/munging/geo/geo_json.rb +54 -0
data/examples/munging/geo/geo_models.rb +69 -0
data/examples/munging/geo/geonames_models.rb +107 -0
data/examples/munging/geo/iso_codes.rb +172 -0
data/examples/munging/geo/reconcile_countries.rb +124 -0
data/examples/munging/geo/tasks.rake +71 -0
data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
data/examples/rake_helper.rb +97 -0
data/examples/ruby_project/Gemfile +6 -0
data/examples/ruby_project/README.md +6 -0
data/examples/ruby_project/a/b/c/.gitkeep +0 -0
data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
data/examples/server_logs/logline.rb +95 -0
data/examples/server_logs/models.rb +66 -0
data/examples/server_logs/page_counts.pig +48 -0
data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
data/examples/serverlogs/models/logline.rb +102 -0
data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
data/examples/serverlogs/visit_paths/common.rb +4 -0
data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
data/examples/splitter.rb +94 -0
data/examples/string_reverser.rb +7 -0
data/examples/text/pig_latin/pig_latinizer.rb +35 -0
data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
data/examples/text/regional_flavor/README.md +14 -0
data/examples/text/regional_flavor/article_wordbags.pig +39 -0
data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
data/examples/twitter.rb +5 -0
data/lib/hanuman.rb +36 -0
data/lib/hanuman/graph.rb +97 -0
data/lib/hanuman/graphvizzer.rb +206 -0
data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
data/lib/hanuman/link.rb +35 -0
data/lib/hanuman/registry.rb +46 -0
data/lib/hanuman/stage.rb +128 -0
data/lib/hanuman/tree.rb +67 -0
data/lib/wu/geo.rb +4 -0
data/lib/wu/geo/geo_grids.numbers +0 -0
data/lib/wu/geo/geolocated.rb +331 -0
data/lib/wu/geo/quadtile.rb +69 -0
data/lib/wu/graph/union_find.rb +62 -0
data/lib/wu/model/reconcilable.rb +63 -0
data/lib/wu/munging.rb +71 -0
data/lib/wu/social/models/twitter.rb +31 -0
data/lib/wu/wikipedia/models.rb +20 -0
data/lib/wukong.rb +54 -0
data/lib/wukong/dataflow.rb +43 -0
data/lib/wukong/doc_helpers.rb +14 -0
data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
data/lib/wukong/doc_helpers/field_handler.rb +91 -0
data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
data/lib/wukong/driver.rb +214 -0
data/lib/wukong/driver/event_machine_driver.rb +15 -0
data/lib/wukong/driver/wiring.rb +68 -0
data/lib/wukong/local.rb +42 -0
data/lib/wukong/local/runner.rb +96 -0
data/lib/wukong/local/stdio_driver.rb +104 -0
data/lib/wukong/logger.rb +102 -0
data/lib/wukong/model/faker.rb +136 -0
data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
data/lib/wukong/plugin.rb +48 -0
data/lib/wukong/processor.rb +110 -0
data/lib/wukong/rake_helper.rb +6 -0
data/lib/wukong/runner.rb +169 -0
data/lib/wukong/runner/boot_sequence.rb +123 -0
data/lib/wukong/runner/code_loader.rb +52 -0
data/lib/wukong/runner/command_runner.rb +44 -0
data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
data/lib/wukong/runner/help_message.rb +42 -0
data/lib/wukong/source.rb +33 -0
data/lib/wukong/source/source_driver.rb +74 -0
data/lib/wukong/source/source_runner.rb +38 -0
data/lib/wukong/spec_helpers.rb +74 -0
data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
data/lib/wukong/version.rb +3 -0
data/lib/wukong/widget/echo.rb +55 -0
data/lib/wukong/widget/extract.rb +122 -0
data/lib/wukong/widget/filters.rb +452 -0
data/lib/wukong/widget/logger.rb +56 -0
data/lib/wukong/widget/operators.rb +82 -0
data/lib/wukong/widget/reducers.rb +10 -0
data/lib/wukong/widget/reducers/accumulator.rb +73 -0
data/lib/wukong/widget/reducers/bin.rb +368 -0
data/lib/wukong/widget/reducers/count.rb +73 -0
data/lib/wukong/widget/reducers/group.rb +128 -0
data/lib/wukong/widget/reducers/group_concat.rb +98 -0
data/lib/wukong/widget/reducers/improver.rb +71 -0
data/lib/wukong/widget/reducers/join_xml.rb +37 -0
data/lib/wukong/widget/reducers/moments.rb +72 -0
data/lib/wukong/widget/reducers/sort.rb +180 -0
data/lib/wukong/widget/reducers/uniq.rb +91 -0
data/lib/wukong/widget/serializers.rb +317 -0
data/lib/wukong/widget/utils.rb +46 -0
data/lib/wukong/widgets.rb +7 -0
data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
data/spec/examples/dataflow/parsing_spec.rb +14 -0
data/spec/examples/dataflow/simple_spec.rb +34 -0
data/spec/examples/dataflow/telegram_spec.rb +43 -0
data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
data/spec/examples/munging/airline_flights_spec.rb +202 -0
data/spec/examples/text/pig_latin_spec.rb +18 -0
data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
data/spec/hanuman/graph_spec.rb +119 -0
data/spec/hanuman/hanuman_spec.rb +10 -0
data/spec/hanuman/registry_spec.rb +123 -0
data/spec/hanuman/stage_spec.rb +81 -0
data/spec/hanuman/tree_spec.rb +119 -0
data/spec/spec.opts +1 -0
data/spec/spec_helper.rb +43 -0
data/spec/support/example_test_helpers.rb +95 -0
data/spec/support/hanuman_test_helpers.rb +92 -0
data/spec/support/integration_helper.rb +38 -0
data/spec/support/model_test_helpers.rb +115 -0
data/spec/support/shared_context_for_graphs.rb +57 -0
data/spec/support/shared_context_for_reducers.rb +37 -0
data/spec/support/shared_examples_for_builders.rb +94 -0
data/spec/support/shared_examples_for_shortcuts.rb +57 -0
data/spec/wu/model/reconcilable_spec.rb +152 -0
data/spec/wukong/dataflow_spec.rb +87 -0
data/spec/wukong/driver_spec.rb +154 -0
data/spec/wukong/local/runner_spec.rb +29 -0
data/spec/wukong/local/stdio_driver_spec.rb +73 -0
data/spec/wukong/local_spec.rb +6 -0
data/spec/wukong/logger_spec.rb +49 -0
data/spec/wukong/model/faker_spec.rb +132 -0
data/spec/wukong/processor_spec.rb +21 -0
data/spec/wukong/runner_spec.rb +132 -0
data/spec/wukong/source_spec.rb +6 -0
data/spec/wukong/widget/extract_spec.rb +101 -0
data/spec/wukong/widget/filters_spec.rb +79 -0
data/spec/wukong/widget/logger_spec.rb +23 -0
data/spec/wukong/widget/operators_spec.rb +25 -0
data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
data/spec/wukong/widget/reducers/count_spec.rb +11 -0
data/spec/wukong/widget/reducers/group_spec.rb +21 -0
data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
data/spec/wukong/widget/serializers_spec.rb +114 -0
data/spec/wukong/widget/sink_spec.rb +19 -0
data/spec/wukong/widget/source_spec.rb +65 -0
data/spec/wukong/wu-local_spec.rb +109 -0
data/spec/wukong/wu-source_spec.rb +32 -0
data/spec/wukong/wu_spec.rb +14 -0
data/spec/wukong/wukong_spec.rb +10 -0
data/wukong.gemspec +35 -0
metadata +465 -0

data/Rakefile ADDED

@@ -0,0 +1,28 @@
+require 'bundler'
+Bundler::GemHelper.install_tasks
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:specs)
+require 'yard'
+YARD::Rake::YardocTask.new
+desc 'Run RSpec with code coverage'
+task :cov do
+  ENV['WUKONG_COV'] = true
+  Rake::Task[:specs].execute
+end
+task :default => :specs
+desc "Create a TAGS file for this project"
+task :tags do
+  files = [%w[Gemfile Guardfile Rakefile README.md].map { |b| File.join(File.dirname(__FILE__), b) }]
+  %w[bin examples lib spec].each do |dir|
+    files << Dir[File.join(File.dirname(__FILE__), "#{dir}/**/*.rb")]
+  end
+  files.each do |arry|
+    sh "etags", '-a', *arry unless arry.empty?
+  end
+end

data/TODO.md ADDED

@@ -0,0 +1,99 @@
+* Driving OR iterated
+* Runner? Executor?
+  - hooks up source to flow,
+  - if iterated source, drives it, otherwise sits in the flow
+* these set the contract for the inbound products
+### slots
+Typical case: one input, `:input`, one output `:output`
+* there are as many products as
+  - the total number of action stage outputs
+  - the concrete input products
+* The number of rsrc->action edges is at most the total number of input slots
+  - (you cannot wire multiple products to the same input slot)
+1. action stage B wires up to an action stage A (which really means "the full set of A's outputs")
+2. I wire action A's output as production product X
+3.
+4. How do I address other stages?
+   - `mapper.cat` a great name for action stage `cat` inside chain `mapper`
+   - `mapper.cat:output`? `mapper.cat_output`?
+Subgraphs own their contents
+## Configuration
+* options on processor become options on flow -- so, in the telegram example,
+  :break_length becomes configurable (somehow)
+## Tests
+* add an examples helper -- runs script on file from data dir, diffs the output.
+__________________________________________________________________________
+* [Ruby-Graphviz](https://github.com/glejeune/Ruby-Graphviz.git) Ruby interface to the GraphViz graphing tool
+* [Ruby GraphML Parser](https://github.com/willcannings/ruby-graphml.git)
+* everything accessible from clean (non-magical) methods.
+* inputs and outputs:
+  - inputs and outputs become an array of symbols
+* You can only have as many macro edges as inputs
+* action stage 'ports'
+  - a list of names for them
+  - can also have an edge going to a
+        _____
+        |
+        --v--
+          |
+          |
+        __^____^__
+        | x  | y |
+        |  foo   |
+        ----------
+create a product with no action? action with anonymous product, wired up later?
+* connections:
+  - action -> action:
+        act_a -> actb
+    act_a :o1 -> rsrc_x
+    act_a :o2 -> rsrc_y
+    act_b :i  <- act_a
+* references:
+  -

data/bin/cutc ADDED

@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+#
+# cutc -- keep only the leading characters of each line (a thin wrapper on `cut -c`).
+#
+# Usage: cutc [WIDTH] [further cut arguments / files ...]
+#
+# Example:
+#
+# A quickie histogram of timestamps; say that for the object in the foo/bar
+# directory, field 3 holds a flat timestamp (YYYYmmddHHMMSS) and you want a
+# histogram by hour (and that foo/bar is small enough to be worth sucking
+# through a single machine):
+#
+#   hdp-catd foo/bar | cuttab 3 | cutc 12 | sort | uniq -c
+#
+# If foo/bar is already sorted leave out the call to sort.
+#
+#
+# The width is $1 if given, else $CUTC_MAX if set, else 200 characters.
+#
+width=${1-${CUTC_MAX-200}}
+shift
+#
+# Hand everything else straight to cut.
+#
+exec cut -c"1-${width}" "$@"

data/bin/cuttab ADDED

@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+fields=${1-"1-"}
+shift
+exec cut  -d'	' -f"$fields" "$@"

data/bin/greptrue ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# greptrue -- grep that always exits with a true status, even when nothing
+# matches. (Otherwise hadoop vomits.)
+# You can set a command line var in hadoop instead, but we'll leave this around
+grep "$@" || true

data/bin/md5sort ADDED

@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+""" sorts lines (or tab-sep records) by md5.  (e.g. for train/test splits).
+optionally prepends with the md5 id too.
+brendan o'connor - anyall.org - gist.github.com/brendano """
+# NOTE(review): uses the `print line,` statement form, i.e. Python 2 syntax.
+import hashlib,sys,optparse
+p = optparse.OptionParser()
+# -k N : hash only the N-th (1-based) tab-separated field instead of the whole line
+p.add_option('-k',  type='int', default=False)
+# -p   : prefix every output line with its md5 hex digest and a tab
+p.add_option('-p', action='store_true')
+opts,args=p.parse_args()
+# The whole of stdin is read into memory before sorting.
+lines = sys.stdin.readlines()
+# s[:-1] drops the last character of the line (the trailing newline) before hashing.
+getter=lambda s: hashlib.md5(s[:-1]).hexdigest()
+if opts.k:
+  getter=lambda s: hashlib.md5(s[:-1].split("\t")[opts.k-1]).hexdigest()
+lines.sort(key=lambda s: getter(s))
+for line in lines:
+  if opts.p:  line = getter(line) + "\t" + line
+  # trailing comma: the line already carries its own newline
+  print line,

data/bin/setcat ADDED

@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+#
+# This script is useful for debugging. it dumps your environment to STDERR
+# and otherwise runs as `cat`
+#
+set >&2
+cat
+true

data/bin/tabchar ADDED

@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# insert a tab char from the command line:
+# echo "hi$(tabchar)there"
+# # => "hi	there"
+exec echo -n -e '\t'

data/bin/uniq-ord ADDED

@@ -0,0 +1,59 @@
+#!/usr/bin/env ruby
+# encoding: ASCII-8BIT
+require 'set'
+unless ARGV.empty?
+  unless ARGV.include?('--help')
+    puts "\n**\nSorry, uniq-ord only works in-line: cat foo.txt bar.tsv | uniq-ord\n**" ; puts
+  end
+  puts <<USAGE
+uniq-ord is ike the uniq command but doesn't depend on prior sorting: it tracks
+each line and only emits the first-seen instance of that line.
+The algorithm is /very/ simplistic: it uses ruby's built-in hash to track lines.
+This can produce false positives, meaning that a line of output might be removed
+even if it hasn't been seen before.  It may also consume an unbounded amount of
+memory (though less than the input text). With a million lines it will consume
+about 70 MB of memory and have more than 1 in a million chance of false
+positive. On a billion lines it will consume many GB and have over 25% odds of
+incorrectly skipping a line.
+However, it's really handy for dealing with in-order lists from the command line.
+USAGE
+  exit(0)
+end
+# # Logging
+#
+# MB = 1024*1024
+# LOG_INTERVAL = 100_000
+# $start = Time.now; $iter = 0; $size = 0
+# def log_line
+#   elapsed = (Time.now - $start).to_f
+#   $stderr.puts("%5d s\t%10.1f l/s\t%5dk<\t%5dk>\t%5d MB\t%9.1f MB/s\t%11d b/l"%[ elapsed, $iter/elapsed, $iter/1000, LINES.count/1000, $size/MB, ($size/MB)/elapsed, $size/$iter ])
+# end
+LINES = Set.new
+$stdin.each do |line|
+  next if LINES.include?(line.hash)
+  puts line
+  LINES << line.hash
+  # $iter += 1 ; $size += line.length
+  # log_line if ($iter % LOG_INTERVAL == 0)
+end
+# log_line
+#
+# # 2.1 GB data, 1M lines, 2000 avg chars/line
+#
+# # Used:   RSS:     71_988 kB     VSZ:     2_509_152 kB
+# # Stats:   38 s  25_859.1 l/s  1000k<  1000k>  1976 MB         51.1 MB/s       2072 b/l
+# # Time:   real     0m41.4 s      user  0m31.6 s          sys  0m8.3 s     pct    96.48
+#
+# # 4.1 GB data, 5.6M lines, 800 avg chars/line
+#
+# # Used:   RSS:    330_644 kB     VSZ:     2_764_236 kB
+# # Stats:  861     6_538.2 l/s  5632k<  5632k>  4158 MB          4.8 MB/s        774 b/l
+# # Time:   real    14m24.6 s     user  13m8.8 s           sys 0m12. s       pct   92.61
+#

data/bin/uniqc ADDED

@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+uniq -c | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ "%15s\t" % $1 }'

data/bin/wu ADDED

@@ -0,0 +1,34 @@
+#!/usr/bin/env ruby
+require 'shellwords'
+now=Time.now.strftime("%Y-%m-%d %H:%M:%S")
+if ARGV.empty?
+    abort "ERROR #{now} [wu                  ] -- Must provide a Wukong command to run. Try the --help option."
+else
+    if ARGV.size == 1 && ARGV.first == '--help'
+	abort <<EOF
+usage: wu COMMAND [OPTIONS] [ARG] ...
+wu is a wrapper for easy use of Wukong's command-line tools.  It takes
+your arguments, constructs the name of the proper wu-tool to call, and
+prepends a call to bundle exec.
+  $ wu local ...
+is equivalent to
+  $ bundle exec wu-local ...
+You can run any of the wu-tools this way:
+  wu-local    wu-source
+  wu-hadoop   wu-storm
+  wu-deploy   wu-load
+EOF
+    else
+      if ARGV.first =~ /^-/
+        abort "ERROR ${now} [wu                  ] -- First argument must be the name of a wu tool to run, got <${1}>"
+      else
+        Kernel.exec "bundle exec wu-#{Shellwords.join(ARGV)}"
+      end
+    end
+end

data/bin/wu-clean-encoding ADDED

@@ -0,0 +1,31 @@
+#!/usr/bin/env ruby
+# encoding:UTF-8
+if ARGV.include?('--help')
+  puts <<USAGE
+wu-clean-encoding cleans malformed characters from stdin.
+If a character is malformed, as defined by valid_encoding?,
+it is replaced with a '�'.
+wu-clean-encoding was built to work with UTF-8, and no
+guarantees are provided for other encodings.
+USAGE
+  exit(0)
+end
+ARGF.each do |line|
+  if line.valid_encoding?
+    $stdout.write line
+  else
+    repaired_line = []
+    line.each_char do |char|
+      if char.valid_encoding?
+        repaired_line << char
+      else
+        repaired_line << "�"
+      end
+    end
+    $stdout.write repaired_line.join
+  end
+end

data/bin/wu-date ADDED

@@ -0,0 +1,13 @@
+#!/bin/sh
+#
+# Outputs a compact wukong-style date:
+#
+#
+#	$ date
+#       Sun Nov  8 03:21:37 CST 2009
+#	$ wu-date
+#	20091108
+#
+exec date +"%Y%m%d"

data/bin/wu-datetime ADDED

@@ -0,0 +1,13 @@
+#!/bin/sh
+#
+# Outputs a compact wukong-style datetime:
+#
+#
+#	$ date
+#       Sun Nov  8 03:21:37 CST 2009
+#	$ wu-datetime
+#	20091108032137
+#
+exec date +"%Y%m%d%H%M%D"

data/bin/wu-hist ADDED

@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+sort | uniq -c | sort -rn | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ $1+"\t" }'

data/bin/wu-lign ADDED

@@ -0,0 +1,186 @@
+#!/usr/bin/env ruby
+USAGE= %Q{
+# h1. wulign -- format a tab-separated file as aligned columns
+#
+# wulign will intelligently reformat a tab-separated file into a tab-separated,
+# space aligned file that is still suitable for further processing. For example,
+# given the log-file input
+#
+#     # cat tag_usage.tsv
+#     2009-07-21T21:39:40 day     65536   3.15479 68750   1171316
+#     2009-07-21T21:39:45 doing   65536   1.04533 26230   1053956
+#     2009-07-21T21:41:53 hapaxlegomenon  65536   0.87574e-05     23707   10051141
+#     2009-07-21T21:44:00 concert 500     0.29290 13367   9733414
+#     2009-07-21T21:44:29 world   65536   1.09110 32850   200916
+#     2009-07-21T21:44:39 world+series    65536   0.49380 9929    7972025
+#     2009-07-21T21:44:54 iranelection    65536   2.91775 14592   136342
+#
+# wulign will reformat it to read
+#
+#     # cat tag_usage.tsv | wu-lign
+#     2009-07-21T21:39:40 day                   65536   3.154791234 68750    1171316
+#     2009-07-21T21:39:45 doing                 65536   1.045330000 26230    1053956
+#     2009-07-21T21:41:53 hapaxlegomenon        65536   0.000008757 23707   10051141
+#     2009-07-21T21:44:00 concert                 500   0.292900000 13367    9733414
+#     2009-07-21T21:44:29 world                 65536   1.091100000 32850     200916
+#     2009-07-21T21:44:39 world+series          65536   0.493800000  9929    7972025
+#     2009-07-21T21:44:54 iranelection          65536   2.917750000 14592     136342
+#
+# The fields are still tab-delimited by exactly one tab -- only spaces are used to
+# pad out fields. You can still use cuttab and friends to manipulate columns.
+#
+# h2. Command-line arguments
+#
+# You can give sprintf-style positional arguments on the command line that will be
+# applied to the corresponding columns. (Blank args are used for placeholding and
+# auto-formatting is still applied).  So with the example above,
+#
+#     cat foo | wulign  '' '' '' '%8.4e'
+#
+# will format the fourth column with "%8.4e", while the first three columns and
+# fifth-and-higher columns are formatted as usual.
+#
+#     ...
+#     2009-07-21T21:39:45 doing           65536   1.0453e+00      26230    1053956
+#     2009-07-21T21:41:53 hapaxlegomenon  65536   8.7574e-06      23707   10051141
+#     2009-07-21T21:44:00 concert           500   2.9290e-01      13367    9733414
+#     ....
+#
+# h2. How it works
+#
+# Wu-lign takes the first 500ish lines, splits into fields on TAB characters,
+# and tries to guess the format (int, float, or string) for each. It builds a
+# consensus of the width and type for corresponding columns in the chunk.  If a
+# column has mixed numeric and string formats it degrades to :mixed, which is
+# basically treated as :string. If a column has mixed :float and :int elements all
+# of them are formatted as float.
+#
+# h2. Notes
+#
+# * Header rows: the first line is used for width alignment but not for type detection.
+#   This means that an initial row of text headers will inform column spacing
+#   but still allow a column of floats (say) to be properly aligned as floats.
+#
+# * It requires a unanimous vote. One screwy line can coerce the whole mess to
+#   :mixed; width formatting will still be applied, though.
+#
+# * It won't set columns wider than 100 chars -- this allows for the occasional
+#   super-wide column without completely breaking your screen.
+#
+# * For :float values, wulign tries to guess at the right number of significant
+#   digits to the left and right of the decimal point.
+#
+# * wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab
+#   delimits a field, every newline a record.
+#
+# wulign isn't intended to be smart, or correct, or reliable -- only to be
+# useful for previewing and organizing tab-formatted files. In general
+# wulign(foo).split("\t").map(&:strip) *should* give output semantically
+# equivalent to its input. (That is, the only changes should be insertion of
+# spaces and re-formatting of numerics.) But still -- reserve its use for human
+# inspection only.
+#
+}
+if ARGV[0] == '--help'
+  puts $0
+  puts USAGE
+  exit
+end
+#
+# How many initial lines to use to guess formatting.  Lines after this are
+# simply reformatted according to the consensus of the initial
+# FORMAT_GUESSING_LINES.
+#
+FORMAT_GUESSING_LINES = 500
+# widest column to set
+MAX_MAX_WIDTH = 100
+INT_RE   = /\A[\d,]+\z/
+FLOAT_RE = /\A([\d,]+)(?:\.(\d+))?(?:e-?\d+)?\z/
+def get_type val
+  case
+  when val == ''       then type = nil
+  when val =~ INT_RE   then type = :int
+  when val =~ FLOAT_RE then type = :float
+  else                      type = :str end
+end
+def consensus_type val, alltype, is_first
+  return :mixed if alltype == :mixed
+  type = get_type(val) or return
+  case
+  when alltype.nil?                  then type
+  when is_first && (alltype == :str) then type
+  when alltype == type               then type
+  when ( ((alltype==:float) && (type == :int)) || ((alltype == :int) && (type == :float)) )
+    :float
+  else :mixed
+  end
+end
+def f_width str
+  str =~ FLOAT_RE or return 0
+  [$1.length, $2 ? $2.length : 0]
+end
+# Per-column state gathered from the first FORMAT_GUESSING_LINES lines.
+maxw       = []   # widest cell seen, capped at MAX_MAX_WIDTH
+col_types  = []   # running type consensus (see consensus_type)
+col_minmag = []   # for :float columns: most fractional digits seen (at least 1)
+col_maxmag = []   # for :float columns: longest integer part seen (at least 1)
+rows       = []   # the buffered, already-split sample rows
+skip_col   = []   # true for columns given an explicit format on the command line
+has_header = false
+# A non-blank positional argument is a format for that column: no type
+# guessing is done for it, though its width is still tracked below.
+ARGV.each_with_index{|v,i| next if (v == '') ; maxw[i] = 0; skip_col[i] = true }
+FORMAT_GUESSING_LINES.times do
+  # readline raises at end of input; treat that as "no more lines"
+  line = $stdin.readline rescue nil
+  break unless line
+  row = line.chomp.split("\t").map{|s| s.strip }
+  col_widths = row.map{|col| col.length }
+  col_widths.each_with_index{|cw,i| maxw[i] = [[cw,maxw[i]].compact.max, MAX_MAX_WIDTH].min }
+  row.each_with_index{|col,i|
+    next if skip_col[i]
+    # Let the first row be text (headers)
+    # (rows.length == 1 holds while the second line is processed, so a :str
+    # consensus coming from the first line can be replaced by that line's type)
+    col_types[i] = consensus_type(col, col_types[i], rows.length == 1)
+    if col_types[i] == :float
+      mantissa, radix = f_width(col)
+      col_minmag[i] = [radix,    col_minmag[i], 1].compact.max
+      col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
+    end
+  }
+  # p [rows.length, has_header, maxw, col_types, col_minmag, col_maxmag, col_widths, row]
+  # Header row: the very first line, when every one of its cells is typed :str.
+  has_header = true if row.all?{|col| get_type(col) == :str } && rows.length == 0
+  rows << row
+end
+# Build one formatter lambda per column from the gathered statistics.
+format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
+  # An explicit command-line format wins; if applying it raises, the cell is
+  # passed through unchanged.
+  next(lambda{|s| default % s rescue s }) if default.to_s != ''
+  case type
+  # strings and undecided columns: left-aligned, padded to the column width
+  when :mixed, nil then lambda{|s| "%-#{width}s" % s }
+  when :str        then lambda{|s| "%-#{width}s" % s }
+  # ints: everything except digits and signs (e.g. commas) is stripped, then right-aligned
+  when :int        then lambda{|s| "%#{width}d"  % s.gsub(/[^\d\-\+]+/, "").to_i }
+  # floats: minmag digits after the point, total width maxmag+minmag+2
+  when :float      then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.gsub(/[^\d\.eE\-\+]+/, "").to_f }
+  else raise "oops type #{type}"  end
+end
+def dump_row row, format
+  puts row.zip(format).map{|c,f| f.call(c) rescue c }.join("\t")
+end
+def dump_header row, maxw
+  puts row.zip(maxw).map{|col, width| "%-#{width}s" % col.to_s }.join("\t")
+end
+pad = [''] * maxw.length
+dump_header(rows.shift, maxw) if has_header
+rows.each do |row|
+  # note -- strips trailing columns
+  dump_row(row, format)
+end
+$stdin.each do |line|
+  row = line.chomp.split("\t").map{|s| s.strip }
+  # note -- strips trailing columns
+  dump_row(row, format)
+end