ul-wukong 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +60 -0
- data/.gitmodules +6 -0
- data/.rspec +2 -0
- data/.travis.yml +19 -0
- data/.yardopts +6 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +17 -0
- data/Guardfile +12 -0
- data/LICENSE.md +95 -0
- data/NOTES-travis.md +31 -0
- data/README-old.md +422 -0
- data/README.md +1308 -0
- data/Rakefile +28 -0
- data/TODO.md +99 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +6 -0
- data/bin/md5sort +20 -0
- data/bin/setcat +11 -0
- data/bin/tabchar +5 -0
- data/bin/uniq-ord +59 -0
- data/bin/uniqc +3 -0
- data/bin/wu +34 -0
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-date +13 -0
- data/bin/wu-datetime +13 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +186 -0
- data/bin/wu-local +4 -0
- data/bin/wu-plus +9 -0
- data/bin/wu-source +5 -0
- data/bin/wu-sum +31 -0
- data/diagrams/wu_local.dot +39 -0
- data/diagrams/wu_local.dot.png +0 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/basic/string_reverser.rb +23 -0
- data/examples/basic/tiny_count.rb +8 -0
- data/examples/basic/word_count/accumulator.rb +26 -0
- data/examples/basic/word_count/tokenizer.rb +13 -0
- data/examples/basic/word_count/word_count.rb +6 -0
- data/examples/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/deploy_pack/Gemfile +6 -0
- data/examples/deploy_pack/README.md +6 -0
- data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
- data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
- data/examples/deploy_pack/config/environment.rb +1 -0
- data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
- data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/dsl/dataflow/simple.rb +12 -0
- data/examples/dsl/dataflow/telegram.rb +45 -0
- data/examples/dsl/workflow/cherry_pie.dot +97 -0
- data/examples/dsl/workflow/cherry_pie.md +104 -0
- data/examples/dsl/workflow/cherry_pie.png +0 -0
- data/examples/dsl/workflow/cherry_pie.rb +101 -0
- data/examples/empty/.gitkeep +0 -0
- data/examples/examples_helper.rb +9 -0
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/implied_geolocation/README.md +63 -0
- data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/loadable.rb +2 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/examples/munging/airline_flights/airplane.rb +0 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/indexable.rb +75 -0
- data/examples/munging/airline_flights/indexable_spec.rb +90 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +107 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/rake_helper.rb +97 -0
- data/examples/ruby_project/Gemfile +6 -0
- data/examples/ruby_project/README.md +6 -0
- data/examples/ruby_project/a/b/c/.gitkeep +0 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/serverlogs/models/logline.rb +102 -0
- data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
- data/examples/serverlogs/visit_paths/common.rb +4 -0
- data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
- data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
- data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
- data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
- data/examples/splitter.rb +94 -0
- data/examples/string_reverser.rb +7 -0
- data/examples/text/pig_latin/pig_latinizer.rb +35 -0
- data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +36 -0
- data/lib/hanuman/graph.rb +97 -0
- data/lib/hanuman/graphvizzer.rb +206 -0
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +128 -0
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wu/geo.rb +4 -0
- data/lib/wu/geo/geo_grids.numbers +0 -0
- data/lib/wu/geo/geolocated.rb +331 -0
- data/lib/wu/geo/quadtile.rb +69 -0
- data/lib/wu/graph/union_find.rb +62 -0
- data/lib/wu/model/reconcilable.rb +63 -0
- data/lib/wu/munging.rb +71 -0
- data/lib/wu/social/models/twitter.rb +31 -0
- data/lib/wu/wikipedia/models.rb +20 -0
- data/lib/wukong.rb +54 -0
- data/lib/wukong/dataflow.rb +43 -0
- data/lib/wukong/doc_helpers.rb +14 -0
- data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
- data/lib/wukong/doc_helpers/field_handler.rb +91 -0
- data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
- data/lib/wukong/driver.rb +214 -0
- data/lib/wukong/driver/event_machine_driver.rb +15 -0
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +42 -0
- data/lib/wukong/local/runner.rb +96 -0
- data/lib/wukong/local/stdio_driver.rb +104 -0
- data/lib/wukong/logger.rb +102 -0
- data/lib/wukong/model/faker.rb +136 -0
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/plugin.rb +48 -0
- data/lib/wukong/processor.rb +110 -0
- data/lib/wukong/rake_helper.rb +6 -0
- data/lib/wukong/runner.rb +169 -0
- data/lib/wukong/runner/boot_sequence.rb +123 -0
- data/lib/wukong/runner/code_loader.rb +52 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
- data/lib/wukong/runner/help_message.rb +42 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers.rb +74 -0
- data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
- data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
- data/lib/wukong/version.rb +3 -0
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/extract.rb +122 -0
- data/lib/wukong/widget/filters.rb +452 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +10 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +368 -0
- data/lib/wukong/widget/reducers/count.rb +73 -0
- data/lib/wukong/widget/reducers/group.rb +128 -0
- data/lib/wukong/widget/reducers/group_concat.rb +98 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +180 -0
- data/lib/wukong/widget/reducers/uniq.rb +91 -0
- data/lib/wukong/widget/serializers.rb +317 -0
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +7 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
- data/spec/examples/dataflow/parsing_spec.rb +14 -0
- data/spec/examples/dataflow/simple_spec.rb +34 -0
- data/spec/examples/dataflow/telegram_spec.rb +43 -0
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +18 -0
- data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
- data/spec/hanuman/graph_spec.rb +119 -0
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +81 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +43 -0
- data/spec/support/example_test_helpers.rb +95 -0
- data/spec/support/hanuman_test_helpers.rb +92 -0
- data/spec/support/integration_helper.rb +38 -0
- data/spec/support/model_test_helpers.rb +115 -0
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +94 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/wu/model/reconcilable_spec.rb +152 -0
- data/spec/wukong/dataflow_spec.rb +87 -0
- data/spec/wukong/driver_spec.rb +154 -0
- data/spec/wukong/local/runner_spec.rb +29 -0
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/local_spec.rb +6 -0
- data/spec/wukong/logger_spec.rb +49 -0
- data/spec/wukong/model/faker_spec.rb +132 -0
- data/spec/wukong/processor_spec.rb +21 -0
- data/spec/wukong/runner_spec.rb +132 -0
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/filters_spec.rb +79 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +21 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
- data/spec/wukong/widget/serializers_spec.rb +114 -0
- data/spec/wukong/widget/sink_spec.rb +19 -0
- data/spec/wukong/widget/source_spec.rb +65 -0
- data/spec/wukong/wu-local_spec.rb +109 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +35 -0
- metadata +465 -0
data/Rakefile
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
3
|
+
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
RSpec::Core::RakeTask.new(:specs)
|
6
|
+
|
7
|
+
require 'yard'
|
8
|
+
YARD::Rake::YardocTask.new
|
9
|
+
|
10
|
+
desc 'Run RSpec with code coverage'
task :cov do
  # ENV only stores String values; assigning the boolean `true` raises
  # TypeError before the specs ever run, so the flag is set as a string.
  ENV['WUKONG_COV'] = 'true'
  # Run the actions of the :specs task directly with the flag in place.
  Rake::Task[:specs].execute
end
|
15
|
+
|
16
|
+
task :default => :specs
|
17
|
+
|
18
|
+
desc "Create a TAGS file for this project"
|
19
|
+
task :tags do
|
20
|
+
files = [%w[Gemfile Guardfile Rakefile README.md].map { |b| File.join(File.dirname(__FILE__), b) }]
|
21
|
+
%w[bin examples lib spec].each do |dir|
|
22
|
+
files << Dir[File.join(File.dirname(__FILE__), "#{dir}/**/*.rb")]
|
23
|
+
end
|
24
|
+
files.each do |arry|
|
25
|
+
sh "etags", '-a', *arry unless arry.empty?
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
data/TODO.md
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
* Driving OR iterated
|
2
|
+
|
3
|
+
* Runner? Executor?
|
4
|
+
- hooks up source to flow,
|
5
|
+
- if iterated source, drives it, otherwise sits in the flow
|
6
|
+
|
7
|
+
* these set the contract for the inbound products
|
8
|
+
|
9
|
+
### slots
|
10
|
+
|
11
|
+
Typical case: one input, `:input`, one output `:output`
|
12
|
+
|
13
|
+
* there are as many products as
|
14
|
+
- the total number of action stage outputs
|
15
|
+
- the concrete input products
|
16
|
+
* The number of rsrc->action edges is at most the total number of input slots
|
17
|
+
- (you cannot wire multiple products to the same input slot)
|
18
|
+
|
19
|
+
|
20
|
+
1. action stage B wires up to an action stage A (which really means "the full set of A's outputs")
|
21
|
+
2. I wire action A's output as production product X
|
22
|
+
3.
|
23
|
+
|
24
|
+
4. How do I address other stages?
|
25
|
+
- `mapper.cat` a great name for action stage `cat` inside chain `mapper`
|
26
|
+
- `mapper.cat:output`? `mapper.cat_output`?
|
27
|
+
|
28
|
+
Subgraphs own their contents
|
29
|
+
|
30
|
+
|
31
|
+
## Configuration
|
32
|
+
|
33
|
+
* options on processor become options on flow -- so, in the telegram example,
|
34
|
+
:break_length becomes configurable (somehow)
|
35
|
+
|
36
|
+
## Tests
|
37
|
+
|
38
|
+
* add an examples helper -- runs script on file from data dir, diffs the output.
|
39
|
+
|
40
|
+
|
41
|
+
|
42
|
+
__________________________________________________________________________
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
* [Ruby-Graphviz](https://github.com/glejeune/Ruby-Graphviz.git) Ruby interface to the GraphViz graphing tool
|
47
|
+
* [Ruby GraphML Parser](https://github.com/willcannings/ruby-graphml.git)
|
48
|
+
|
49
|
+
|
50
|
+
|
51
|
+
* everything accessible from clean (non-magical) methods.
|
52
|
+
|
53
|
+
* inputs and outputs:
|
54
|
+
- inputs and outputs become an array of symbols
|
55
|
+
|
56
|
+
|
57
|
+
* You can only have as many macro edges as inputs
|
58
|
+
|
59
|
+
* action stage 'ports'
|
60
|
+
- a list of names for them
|
61
|
+
- can also have an edge going to a
|
62
|
+
|
63
|
+
|
64
|
+
_____
|
65
|
+
|
|
66
|
+
--v--
|
67
|
+
|
|
68
|
+
|
|
69
|
+
__^____^__
|
70
|
+
| x | y |
|
71
|
+
| foo |
|
72
|
+
----------
|
73
|
+
|
74
|
+
create a product with no action? action with anonymous product, wired up later?
|
75
|
+
|
76
|
+
|
77
|
+
* connections:
|
78
|
+
|
79
|
+
- action -> action:
|
80
|
+
|
81
|
+
act_a -> actb
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
act_a :o1 -> rsrc_x
|
87
|
+
act_a :o2 -> rsrc_y
|
88
|
+
|
89
|
+
act_b :i <- act_a
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
* references:
|
94
|
+
-
|
95
|
+
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
|
data/bin/cutc
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
#
|
4
|
+
# cut 1
|
5
|
+
#
|
6
|
+
# Example:
|
7
|
+
#
|
8
|
+
# A quickie histogram of timestamps; say that for the object in the foo/bar
|
9
|
+
# directory, field 3 holds a flat timestamp (YYYYmmddHHMMSS) and you want a
|
10
|
+
# histogram by hour (and that foo/bar is small enough to be worth sucking
|
11
|
+
# through a single machine):
|
12
|
+
#
|
13
|
+
# hdp-catd foo/bar | cuttab 3 | cutc 12 | sort | uniq -c
|
14
|
+
#
|
15
|
+
# If foo/bar is already sorted leave out the call to sort.
|
16
|
+
#
|
17
|
+
|
18
|
+
|
19
|
+
#
|
20
|
+
# Set it to cut up to $1 (if defined), or if not, up to $CUTC_MAX (if defined), or 200 chars as a fallback.
|
21
|
+
#
|
22
|
+
CUTC_MAX=${CUTC_MAX-200}
|
23
|
+
CUTC_MAX=${1-$CUTC_MAX}
|
24
|
+
cutchars="1-${CUTC_MAX}"
|
25
|
+
shift
|
26
|
+
|
27
|
+
#
|
28
|
+
# Do the cuttin'
|
29
|
+
#
|
30
|
+
exec cut -c"${cutchars}" "$@"
|
data/bin/cuttab
ADDED
data/bin/greptrue
ADDED
data/bin/md5sort
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
""" sorts lines (or tab-sep records) by md5. (e.g. for train/test splits).
|
3
|
+
optionally prepends with the md5 id too.
|
4
|
+
brendan o'connor - anyall.org - gist.github.com/brendano """
|
5
|
+
|
6
|
+
import hashlib,sys,optparse
|
7
|
+
p = optparse.OptionParser()
|
8
|
+
p.add_option('-k', type='int', default=False)
|
9
|
+
p.add_option('-p', action='store_true')
|
10
|
+
opts,args=p.parse_args()
|
11
|
+
|
12
|
+
lines = sys.stdin.readlines()
|
13
|
+
getter=lambda s: hashlib.md5(s[:-1]).hexdigest()
|
14
|
+
if opts.k:
|
15
|
+
getter=lambda s: hashlib.md5(s[:-1].split("\t")[opts.k-1]).hexdigest()
|
16
|
+
|
17
|
+
lines.sort(key=lambda s: getter(s))
|
18
|
+
for line in lines:
|
19
|
+
if opts.p: line = getter(line) + "\t" + line
|
20
|
+
print line,
|
data/bin/setcat
ADDED
data/bin/tabchar
ADDED
data/bin/uniq-ord
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: ASCII-8BIT
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
unless ARGV.empty?
|
6
|
+
unless ARGV.include?('--help')
|
7
|
+
puts "\n**\nSorry, uniq-ord only works in-line: cat foo.txt bar.tsv | uniq-ord\n**" ; puts
|
8
|
+
end
|
9
|
+
puts <<USAGE
|
10
|
+
uniq-ord is like the uniq command but doesn't depend on prior sorting: it tracks
|
11
|
+
each line and only emits the first-seen instance of that line.
|
12
|
+
|
13
|
+
The algorithm is /very/ simplistic: it uses ruby's built-in hash to track lines.
|
14
|
+
This can produce false positives, meaning that a line of output might be removed
|
15
|
+
even if it hasn't been seen before. It may also consume an unbounded amount of
|
16
|
+
memory (though less than the input text). With a million lines it will consume
|
17
|
+
about 70 MB of memory and have more than 1 in a million chance of false
|
18
|
+
positive. On a billion lines it will consume many GB and have over 25% odds of
|
19
|
+
incorrectly skipping a line.
|
20
|
+
|
21
|
+
However, it's really handy for dealing with in-order lists from the command line.
|
22
|
+
USAGE
|
23
|
+
exit(0)
|
24
|
+
end
|
25
|
+
|
26
|
+
# # Logging
|
27
|
+
#
|
28
|
+
# MB = 1024*1024
|
29
|
+
# LOG_INTERVAL = 100_000
|
30
|
+
# $start = Time.now; $iter = 0; $size = 0
|
31
|
+
# def log_line
|
32
|
+
# elapsed = (Time.now - $start).to_f
|
33
|
+
# $stderr.puts("%5d s\t%10.1f l/s\t%5dk<\t%5dk>\t%5d MB\t%9.1f MB/s\t%11d b/l"%[ elapsed, $iter/elapsed, $iter/1000, LINES.count/1000, $size/MB, ($size/MB)/elapsed, $size/$iter ])
|
34
|
+
# end
|
35
|
+
|
36
|
+
LINES = Set.new
|
37
|
+
$stdin.each do |line|
|
38
|
+
next if LINES.include?(line.hash)
|
39
|
+
puts line
|
40
|
+
LINES << line.hash
|
41
|
+
# $iter += 1 ; $size += line.length
|
42
|
+
# log_line if ($iter % LOG_INTERVAL == 0)
|
43
|
+
end
|
44
|
+
# log_line
|
45
|
+
|
46
|
+
#
|
47
|
+
# # 2.1 GB data, 1M lines, 2000 avg chars/line
|
48
|
+
#
|
49
|
+
# # Used: RSS: 71_988 kB VSZ: 2_509_152 kB
|
50
|
+
# # Stats: 38 s 25_859.1 l/s 1000k< 1000k> 1976 MB 51.1 MB/s 2072 b/l
|
51
|
+
# # Time: real 0m41.4 s user 0m31.6 s sys 0m8.3 s pct 96.48
|
52
|
+
#
|
53
|
+
# # 4.1 GB data, 5.6M lines, 800 avg chars/line
|
54
|
+
#
|
55
|
+
# # Used: RSS: 330_644 kB VSZ: 2_764_236 kB
|
56
|
+
# # Stats: 861 6_538.2 l/s 5632k< 5632k> 4158 MB 4.8 MB/s 774 b/l
|
57
|
+
# # Time: real 14m24.6 s user 13m8.8 s sys 0m12. s pct 92.61
|
58
|
+
#
|
59
|
+
|
data/bin/uniqc
ADDED
data/bin/wu
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'shellwords'
|
3
|
+
now=Time.now.strftime("%Y-%m-%d %H:%M:%S")
|
4
|
+
if ARGV.empty?
|
5
|
+
abort "ERROR #{now} [wu ] -- Must provide a Wukong command to run. Try the --help option."
|
6
|
+
else
|
7
|
+
if ARGV.size == 1 && ARGV.first == '--help'
|
8
|
+
abort <<EOF
|
9
|
+
usage: wu COMMAND [OPTIONS] [ARG] ...
|
10
|
+
|
11
|
+
wu is a wrapper for easy use of Wukong's command-line tools. It takes
|
12
|
+
your arguments, constructs the name of the proper wu-tool to call, and
|
13
|
+
prepends a call to bundle exec.
|
14
|
+
|
15
|
+
$ wu local ...
|
16
|
+
|
17
|
+
is equivalent to
|
18
|
+
|
19
|
+
$ bundle exec wu-local ...
|
20
|
+
|
21
|
+
You can run any of the wu-tools this way:
|
22
|
+
|
23
|
+
wu-local wu-source
|
24
|
+
wu-hadoop wu-storm
|
25
|
+
wu-deploy wu-load
|
26
|
+
EOF
|
27
|
+
else
|
28
|
+
if ARGV.first =~ /^-/
  # The first argument looks like an option rather than a tool name.
  # Ruby interpolates with #{...}; the shell-style ${now} / ${1} placeholders
  # were being printed literally instead of the timestamp and the argument.
  abort "ERROR #{now} [wu ] -- First argument must be the name of a wu tool to run, got <#{ARGV.first}>"
else
  # Replace this process with `bundle exec wu-<command> <args...>`;
  # Shellwords.join escapes the arguments for the shell.
  Kernel.exec "bundle exec wu-#{Shellwords.join(ARGV)}"
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding:UTF-8
|
3
|
+
|
4
|
+
if ARGV.include?('--help')
|
5
|
+
puts <<USAGE
|
6
|
+
wu-clean-encoding cleans malformed characters from stdin.
|
7
|
+
|
8
|
+
If a character is malformed, as defined by valid_encoding?,
|
9
|
+
it is replaced with a '�'.
|
10
|
+
|
11
|
+
wu-clean-encoding was built to work with UTF-8, and no
|
12
|
+
guarantees are provided for other encodings.
|
13
|
+
USAGE
|
14
|
+
exit(0)
|
15
|
+
end
|
16
|
+
|
17
|
+
ARGF.each do |line|
|
18
|
+
if line.valid_encoding?
|
19
|
+
$stdout.write line
|
20
|
+
else
|
21
|
+
repaired_line = []
|
22
|
+
line.each_char do |char|
|
23
|
+
if char.valid_encoding?
|
24
|
+
repaired_line << char
|
25
|
+
else
|
26
|
+
repaired_line << "�"
|
27
|
+
end
|
28
|
+
end
|
29
|
+
$stdout.write repaired_line.join
|
30
|
+
end
|
31
|
+
end
|
data/bin/wu-date
ADDED
data/bin/wu-datetime
ADDED
data/bin/wu-hist
ADDED
data/bin/wu-lign
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
USAGE= %Q{
|
4
|
+
# h1. wulign -- format a tab-separated file as aligned columns
|
5
|
+
#
|
6
|
+
# wulign will intelligently reformat a tab-separated file into a tab-separated,
|
7
|
+
# space aligned file that is still suitable for further processing. For example,
|
8
|
+
# given the log-file input
|
9
|
+
#
|
10
|
+
# # cat tag_usage.tsv
|
11
|
+
# 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
|
12
|
+
# 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
|
13
|
+
# 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
|
14
|
+
# 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
|
15
|
+
# 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
|
16
|
+
# 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
|
17
|
+
# 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
|
18
|
+
#
|
19
|
+
# wulign will reformat it to read
|
20
|
+
#
|
21
|
+
# # cat tag_usage.tsv | wu-lign
|
22
|
+
# 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
|
23
|
+
# 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
|
24
|
+
# 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
|
25
|
+
# 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
|
26
|
+
# 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
|
27
|
+
# 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
|
28
|
+
# 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
|
29
|
+
#
|
30
|
+
# The fields are still tab-delimited by exactly one tab -- only spaces are used to
|
31
|
+
# pad out fields. You can still use cuttab and friends to manipulate columns.
|
32
|
+
#
|
33
|
+
# h2. Command-line arguments
|
34
|
+
#
|
35
|
+
# You can give sprintf-style positional arguments on the command line that will be
|
36
|
+
# applied to the corresponding columns. (Blank args are used for placeholding and
|
37
|
+
# auto-formatting is still applied). So with the example above,
|
38
|
+
#
|
39
|
+
# cat foo | wulign '' '' '' '%8.4e'
|
40
|
+
#
|
41
|
+
# will format the fourth column with "%8.4e", while the first three columns and
|
42
|
+
# fifth-and-higher columns are formatted as usual.
|
43
|
+
#
|
44
|
+
# ...
|
45
|
+
# 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
|
46
|
+
# 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
|
47
|
+
# 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
|
48
|
+
# ....
|
49
|
+
#
|
50
|
+
# h2. How it works
|
51
|
+
#
|
52
|
+
# Wu-lign takes the first 500ish lines, splits into fields on TAB characters,
|
53
|
+
# and tries to guess the format (int, float, or string) for each. It builds a
|
54
|
+
# consensus of the width and type for corresponding columns in the chunk. If a
|
55
|
+
# column has mixed numeric and string formats it degrades to :mixed, which is
|
56
|
+
# basically treated as :string. If a column has mixed :float and :int elements all
|
57
|
+
# of them are formatted as float.
|
58
|
+
#
|
59
|
+
# h2. Notes
|
60
|
+
#
|
61
|
+
# * Header rows: the first line is used for width alignment but not for type detection.
|
62
|
+
# This means that an initial row of text headers will inform column spacing
|
63
|
+
# but still allow a column of floats (say) to be properly aligned as floats.
|
64
|
+
#
|
65
|
+
# * It requires a unanimous vote. One screwy line can coerce the whole mess to
|
66
|
+
# :mixed; width formatting will still be applied, though.
|
67
|
+
#
|
68
|
+
# * It won't set columns wider than 100 chars -- this allows for the occasional
|
69
|
+
# super-wide column without completely breaking your screen.
|
70
|
+
#
|
71
|
+
# * For :float values, wulign tries to guess at the right number of significant
|
72
|
+
# digits to the left and right of the decimal point.
|
73
|
+
#
|
74
|
+
# * wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab
|
75
|
+
# delimits a field, every newline a record.
|
76
|
+
#
|
77
|
+
# wulign isn't intended to be smart, or correct, or reliable -- only to be
|
78
|
+
# useful for previewing and organizing tab-formatted files. In general
|
79
|
+
# wulign(foo).split("\t").map(&:strip) *should* give output semantically
|
80
|
+
# equivalent to its input. (That is, the only changes should be insertion of
|
81
|
+
# spaces and re-formatting of numerics.) But still -- reserve its use for human
|
82
|
+
# inspection only.
|
83
|
+
#
|
84
|
+
}
|
85
|
+
|
86
|
+
if ARGV[0] == '--help'
|
87
|
+
puts $0
|
88
|
+
puts USAGE
|
89
|
+
exit
|
90
|
+
end
|
91
|
+
|
92
|
+
#
|
93
|
+
# How many initial lines to use to guess formatting. Lines after this are
|
94
|
+
# simply reformatted according to the consensus of the initial
|
95
|
+
# FORMAT_GUESSING_LINES.
|
96
|
+
#
|
97
|
+
FORMAT_GUESSING_LINES = 500
|
98
|
+
# widest column to set
|
99
|
+
MAX_MAX_WIDTH = 100
|
100
|
+
|
101
|
+
# A cell made only of digits and thousands-separator commas.
INT_RE   = /\A[\d,]+\z/
# Digits/commas, optional fractional part (captured), optional exponent.
# Capture 1 is the integer part, capture 2 the digits after the decimal point.
FLOAT_RE = /\A([\d,]+)(?:\.(\d+))?(?:e-?\d+)?\z/

# Classify a single cell.
#
# @param val [String] the stripped cell text
# @return [Symbol, nil] :int, :float or :str; nil for an empty cell
def get_type(val)
  return nil if val == ''
  return :int   if val =~ INT_RE
  return :float if val =~ FLOAT_RE
  :str
end

# Fold one more cell into the running type consensus for its column.
#
# @param val [String] the stripped cell text
# @param alltype [Symbol, nil] consensus so far (nil when nothing seen yet)
# @param is_first [Boolean] true when a preceding :str consensus may be
#   overridden by this cell's type (the caller uses this so a text header
#   does not force the column to :mixed)
# @return [Symbol, nil] the updated consensus
def consensus_type(val, alltype, is_first)
  return :mixed if alltype == :mixed
  type = get_type(val)
  # A blank cell carries no type information, so it must leave the consensus
  # untouched. Returning nil here used to reset the column, letting a later
  # numeric cell re-type a string column and print earlier strings as 0.
  return alltype if type.nil?
  return type if alltype.nil?
  return type if is_first && alltype == :str
  return type if alltype == type
  # Ints and floats in the same column are all formatted as floats.
  numeric = [:int, :float]
  return :float if numeric.include?(alltype) && numeric.include?(type)
  :mixed
end

# Measure the digit counts on each side of the decimal point.
#
# @param str [String] a cell expected to look like a number
# @return [Array(Integer, Integer)] [integer-part length, fraction length];
#   [0, 0] when the text does not match FLOAT_RE, so callers can always
#   destructure the result as a pair
def f_width(str)
  match = FLOAT_RE.match(str)
  return [0, 0] unless match
  [match[1].length, match[2] ? match[2].length : 0]
end
|
129
|
+
|
130
|
+
maxw = []
|
131
|
+
col_types = []
|
132
|
+
col_minmag = []
|
133
|
+
col_maxmag = []
|
134
|
+
rows = []
|
135
|
+
skip_col = []
|
136
|
+
has_header = false
|
137
|
+
ARGV.each_with_index{|v,i| next if (v == '') ; maxw[i] = 0; skip_col[i] = true }
|
138
|
+
FORMAT_GUESSING_LINES.times do
|
139
|
+
line = $stdin.readline rescue nil
|
140
|
+
break unless line
|
141
|
+
row = line.chomp.split("\t").map{|s| s.strip }
|
142
|
+
col_widths = row.map{|col| col.length }
|
143
|
+
col_widths.each_with_index{|cw,i| maxw[i] = [[cw,maxw[i]].compact.max, MAX_MAX_WIDTH].min }
|
144
|
+
row.each_with_index{|col,i|
|
145
|
+
next if skip_col[i]
|
146
|
+
# Let the first row be text (headers)
|
147
|
+
col_types[i] = consensus_type(col, col_types[i], rows.length == 1)
|
148
|
+
if col_types[i] == :float
|
149
|
+
mantissa, radix = f_width(col)
|
150
|
+
col_minmag[i] = [radix, col_minmag[i], 1].compact.max
|
151
|
+
col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
|
152
|
+
end
|
153
|
+
}
|
154
|
+
# p [rows.length, has_header, maxw, col_types, col_minmag, col_maxmag, col_widths, row]
|
155
|
+
has_header = true if row.all?{|col| get_type(col) == :str } && rows.length == 0
|
156
|
+
rows << row
|
157
|
+
end
|
158
|
+
|
159
|
+
format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
|
160
|
+
next(lambda{|s| default % s rescue s }) if default.to_s != ''
|
161
|
+
case type
|
162
|
+
when :mixed, nil then lambda{|s| "%-#{width}s" % s }
|
163
|
+
when :str then lambda{|s| "%-#{width}s" % s }
|
164
|
+
when :int then lambda{|s| "%#{width}d" % s.gsub(/[^\d\-\+]+/, "").to_i }
|
165
|
+
when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.gsub(/[^\d\.eE\-\+]+/, "").to_f }
|
166
|
+
else raise "oops type #{type}" end
|
167
|
+
end
|
168
|
+
|
169
|
+
def dump_row row, format
|
170
|
+
puts row.zip(format).map{|c,f| f.call(c) rescue c }.join("\t")
|
171
|
+
end
|
172
|
+
def dump_header row, maxw
|
173
|
+
puts row.zip(maxw).map{|col, width| "%-#{width}s" % col.to_s }.join("\t")
|
174
|
+
end
|
175
|
+
|
176
|
+
pad = [''] * maxw.length
|
177
|
+
dump_header(rows.shift, maxw) if has_header
|
178
|
+
rows.each do |row|
|
179
|
+
# note -- strips trailing columns
|
180
|
+
dump_row(row, format)
|
181
|
+
end
|
182
|
+
$stdin.each do |line|
|
183
|
+
row = line.chomp.split("\t").map{|s| s.strip }
|
184
|
+
# note -- strips trailing columns
|
185
|
+
dump_row(row, format)
|
186
|
+
end
|