wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
data/docpages/wutils.textile
DELETED
@@ -1,263 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: mrflip.github.com/wukong - wu-utils utilities
|
4
|
-
collapse: false
|
5
|
-
---
|
6
|
-
|
7
|
-
h1(gemheader). Wukong Utility Scripts
|
8
|
-
|
9
|
-
** "Overview of wutils":#wutils -- command listing
|
10
|
-
** "Stupid command-line tricks":#cmdlinetricks using the wutils
|
11
|
-
** "wu-lign":#wulign -- present a tab-separated file as aligned columns
|
12
|
-
** Dear Lazyweb, please build this for us: "tab-oriented version of the Textutils library":#wutilsinc
|
13
|
-
|
14
|
-
<notextile><div class="toggle"></notextile>
|
15
|
-
|
16
|
-
h2(#cmdlinetricks). Stupid command-line tricks
|
17
|
-
|
18
|
-
Here are a few useful little snippets you can run from the command line:
|
19
|
-
|
20
|
-
h3. Histogram
|
21
|
-
|
22
|
-
Given data with a date column:
|
23
|
-
|
24
|
-
<pre>
|
25
|
-
message 235623 20090423012345 Now is the winter of our discontent Made glorious summer by this son of York
|
26
|
-
message 235623 20080101230900 These pretzels are making me THIRSTY!
|
27
|
-
...
|
28
|
-
</pre>
|
29
|
-
|
30
|
-
You can calculate number of messages sent by day with
|
31
|
-
|
32
|
-
<pre>
|
33
|
-
cat messages | cuttab 3 | cutc 8 | sort | uniq -c
|
34
|
-
</pre>
|
35
|
-
|
36
|
-
(see the wuhist command, below.)
|
37
|
-
|
38
|
-
h3. Simple intersection, union, etc
|
39
|
-
|
40
|
-
For two datasets (batch_1 and batch_2) with unique entries (no repeated lines),
|
41
|
-
|
42
|
-
* Their union is simple:
|
43
|
-
|
44
|
-
<pre>
|
45
|
-
cat batch_1 batch_2 | sort -u
|
46
|
-
</pre>
|
47
|
-
|
48
|
-
* To find their intersection, concatenate the two sets and filter out everything that only occurred once.
|
49
|
-
|
50
|
-
<pre>
|
51
|
-
cat batch_1 batch_2 | sort | uniq -c | egrep -v '^ *1 '
|
52
|
-
</pre>
|
53
|
-
|
54
|
-
* For the complement of the intersection, use @... | egrep '^ *1 '@
|
55
|
-
|
56
|
-
* In both cases, if the files are each internally sorted, the commandline sort takes a --merge flag:
|
57
|
-
|
58
|
-
<pre>
|
59
|
-
sort --merge -u batch_1 batch_2
|
60
|
-
</pre>
|
61
|
-
|
62
|
-
<notextile></div><div class="toggle"></notextile>
|
63
|
-
|
64
|
-
h2(#wutils). Wutils Command Listing
|
65
|
-
|
66
|
-
h3. cutc
|
67
|
-
|
68
|
-
@cutc [colnum]@
|
69
|
-
|
70
|
-
Ex.
|
71
|
-
|
72
|
-
@echo -e 'foo\tbar\tbaz' | cutc 6@
|
73
|
-
@foo ba@
|
74
|
-
|
75
|
-
Cuts from beginning of line to given column (default 200). A tab is one character, so right margin can still be ragged.
|
76
|
-
|
77
|
-
h3. cuttab
|
78
|
-
|
79
|
-
@cuttab [colspec]@
|
80
|
-
|
81
|
-
Cuts given tab-separated columns. You can give a comma separated list of numbers
|
82
|
-
or ranges (e.g. 1-4). Columns are numbered from 1.
|
83
|
-
|
84
|
-
Ex.
|
85
|
-
|
86
|
-
<pre>
|
87
|
-
echo -e 'foo\tbar\tbaz' | cuttab 1,3
|
88
|
-
foo baz
|
89
|
-
</pre>
|
90
|
-
|
91
|
-
h3. hdp-*
|
92
|
-
|
93
|
-
These perform the corresponding commands on the HDFS filesystem. In general,
|
94
|
-
where they accept command-line flags, they go with the GNU-style ones, not the
|
95
|
-
hadoop-style: so, @hdp-du -s dir@ or @hdp-rm -r foo/@
|
96
|
-
|
97
|
-
* @hdp-cat@
|
98
|
-
* @hdp-catd@ -- cats the files that don't start with '_' in a directory. Use this for a pile of @.../part-00000@ files
|
99
|
-
* @hdp-du@
|
100
|
-
* @hdp-get@
|
101
|
-
* @hdp-kill@
|
102
|
-
* @hdp-ls@
|
103
|
-
* @hdp-mkdir@
|
104
|
-
* @hdp-mv@
|
105
|
-
* @hdp-ps@
|
106
|
-
* @hdp-put@
|
107
|
-
* @hdp-rm@
|
108
|
-
* @hdp-sync@
|
109
|
-
|
110
|
-
h3. hdp-sort, hdp-stream, hdp-stream-flat
|
111
|
-
|
112
|
-
* @hdp-sort@
|
113
|
-
* @hdp-stream@
|
114
|
-
* @hdp-stream-flat@
|
115
|
-
|
116
|
-
<code><pre>
|
117
|
-
hdp-stream input_filespec output_file map_cmd reduce_cmd num_key_fields
|
118
|
-
</pre></code>
|
119
|
-
|
120
|
-
h3. tabchar
|
121
|
-
|
122
|
-
Outputs a single tab character.
|
123
|
-
|
124
|
-
h3. wuhist
|
125
|
-
|
126
|
-
Occasionally useful to gather a lexical histogram of a single column:
|
127
|
-
|
128
|
-
Ex.
|
129
|
-
|
130
|
-
<code><pre>
|
131
|
-
$ echo -e 'foo\nbar\nbar\nfoo\nfoo\nfoo\n7' | ./wuhist
|
132
|
-
4 foo
|
133
|
-
2 bar
|
134
|
-
1 7
|
135
|
-
</pre></code>
|
136
|
-
|
137
|
-
(the output will have a tab between the first and second column, for further processing.)
|
138
|
-
|
139
|
-
h3. wulign
|
140
|
-
|
141
|
-
Intelligently format a tab-separated file into aligned columns (while remaining tab-separated for further processing). See "below":#wulign.
|
142
|
-
|
143
|
-
h3. hdp-parts_to_keys.rb
|
144
|
-
|
145
|
-
A *very* clumsy script to rename reduced hadoop output files by their initial key.
|
146
|
-
|
147
|
-
If your output file has an initial key in the first column and you pass it through hdp-sort, they will be distributed across reducers and thus output files. (Because of the way hadoop hashes the keys, there's no guarantee that each file will get a distinct key. You could have 2 keys with a million entries and they could land sequentially on the same reducer, always fun.)
|
148
|
-
|
149
|
-
If you're willing to roll the dice, this script will rename files according to the first key in the first line.
|
150
|
-
|
151
|
-
**Do you have or know of a native hadoop utility to do this?** If so, please get in touch!
|
152
|
-
|
153
|
-
<notextile></div><div class="toggle"></notextile>
|
154
|
-
|
155
|
-
h2(#wulign). wu-lign -- format a tab-separated file as aligned columns
|
156
|
-
|
157
|
-
wu-lign will intelligently reformat a tab-separated file into a tab-separated, space aligned file that is still suitable for further processing. For example, given the log-file input
|
158
|
-
|
159
|
-
<pre><code>
|
160
|
-
2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
|
161
|
-
2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
|
162
|
-
2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
|
163
|
-
2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
|
164
|
-
2009-07-21T21:44:29 world 65536 1.09110 32850 200916
|
165
|
-
2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
|
166
|
-
2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
|
167
|
-
</code></pre>
|
168
|
-
|
169
|
-
wu-lign will reformat it to read
|
170
|
-
|
171
|
-
<pre><code>
|
172
|
-
2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
|
173
|
-
2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
|
174
|
-
2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
|
175
|
-
2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
|
176
|
-
2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
|
177
|
-
2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
|
178
|
-
2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
|
179
|
-
</code></pre>
|
180
|
-
|
181
|
-
The fields are still tab-delimited by exactly one tab -- only spaces are used to pad out fields. You can still use cuttab and friends to manipulate columns.
|
182
|
-
|
183
|
-
wu-lign isn't intended to be smart, or correct, or reliable -- only to be useful for previewing and organizing tab-formatted files. In general @wu-lign(foo).split("\t").map(&:strip)@ *should* give output semantically equivalent to its input. (That is, the only changes should be insertion of spaces and re-formatting of numerics.) But still -- reserve its use for human inspection only.
|
184
|
-
|
185
|
-
(Note: tab characters in this source code file have been converted to spaces; replace whitespace with tab in the first example if you'd like to play along at home.)
|
186
|
-
|
187
|
-
h3. How it works
|
188
|
-
|
189
|
-
Wu-Lign takes the first 1000 lines, splits by TAB characters into fields, and tries to guess the format -- int, float, or string -- for each. It builds a consensus of the width and type for corresponding columns in the chunk. If a column has mixed numeric and string formats it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements all of them are formatted as float.
|
190
|
-
|
191
|
-
h3. Command-line arguments
|
192
|
-
|
193
|
-
You can give sprintf-style positional arguments on the command line that will be applied to the corresponding columns. (Blank args are used for placeholding and auto-formatting is still applied). So with the example above,
|
194
|
-
|
195
|
-
@cat foo | wu-lign '' '' '' '%8.4e'@
|
196
|
-
|
197
|
-
will format the fourth column with "%8.4e", while the first three columns and fifth-and-higher columns are formatted as usual.
|
198
|
-
|
199
|
-
<pre><code>
|
200
|
-
...
|
201
|
-
2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
|
202
|
-
2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
|
203
|
-
2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
|
204
|
-
....
|
205
|
-
</code></pre>
|
206
|
-
|
207
|
-
h3. Notes
|
208
|
-
|
209
|
-
* It has no knowledge of header rows. An all-text first line will screw everything up.
|
210
|
-
* It also requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
|
211
|
-
* It won't set columns wider than 70 chars -- this allows for the occasional super-wide column without completely breaking your screen.
|
212
|
-
* For :float values, wu-lign tries to guess at the right number of significant digits to the left and right of the decimal point.
|
213
|
-
* wu-lign does not parse 'TSV files' in their strict sense -- there is no quoting or escaping; every tab delimits a field, every newline a record.
|
214
|
-
|
215
|
-
h2(#wutilsinc). Dear Lazyweb, please build this
|
216
|
-
|
217
|
-
* uniq - report or filter out repeated lines in a file
|
218
|
-
** -c produces line<tab>count
|
219
|
-
** --ignore f1,f2,... discards given fields from consideration. field syntax same as for cut, etc.
|
220
|
-
|
221
|
-
* sort - sort lines of text files
|
222
|
-
** columns indexed as tab-separated
|
223
|
-
** can specify any column order, uses same field spec as cut
|
224
|
-
* tsort - topological sort of a directed graph
|
225
|
-
|
226
|
-
* cut - select portions of each line of a file
|
227
|
-
** can reorder columns
|
228
|
-
* nl - line numbering filter
|
229
|
-
** takes prefix, suffix
|
230
|
-
** count \t line -OR- line \t count
|
231
|
-
|
232
|
-
* wc - word, line, character, and byte count
|
233
|
-
** field count (tab-separated fields)
|
234
|
-
* paste - merge corresponding or subsequent lines of files
|
235
|
-
* expand, unexpand - expand tabs to spaces, and vice versa
|
236
|
-
* seq
|
237
|
-
* simple row, column sums
|
238
|
-
* join - relational database operator
|
239
|
-
* tac
|
240
|
-
|
241
|
-
* cat - concatenate and print files
|
242
|
-
* head - display first lines of a file
|
243
|
-
* tail - display the last part of a file
|
244
|
-
* shuf
|
245
|
-
* split - split a file into pieces
|
246
|
-
* csplit - split files based on context
|
247
|
-
* tee - pipe fitting
|
248
|
-
|
249
|
-
* ls - list directory contents.
|
250
|
-
* df - display free disk space
|
251
|
-
* du - display disk usage statistics
|
252
|
-
** tab-delimited, space aligned
|
253
|
-
|
254
|
-
* od - octal, decimal, hex, ASCII dump
|
255
|
-
* printf - formatted output
|
256
|
-
* cksum, sum - display file checksums and block counts
|
257
|
-
* md5sum
|
258
|
-
|
259
|
-
* diff
|
260
|
-
* comm
|
261
|
-
|
262
|
-
|
263
|
-
<notextile></div></notextile>
|
@@ -1,11 +0,0 @@
|
|
1
|
-
|
2
|
-
# TODO: a flow with splits and stuff
|
3
|
-
|
4
|
-
# parsed = map{|line| ApacheLogLine.make(line) }
|
5
|
-
#
|
6
|
-
# input(:default) > parsed
|
7
|
-
#
|
8
|
-
# parsed > split.into(
|
9
|
-
# to_json > output(:dump, stdout),
|
10
|
-
# to_tsv > output(:tsv, file_sink(Pathname.path_to(:tmp, 'foo.tsv')))
|
11
|
-
# )
|
data/examples/dataflow/donuts.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
Wukong.dataflow(:gotta_make_the_donuts) do
|
2
|
-
input :dough_circles, dough_hopper
|
3
|
-
output :donut_box, box(:capacity => 12)
|
4
|
-
|
5
|
-
input(:dough_circles) >
|
6
|
-
frier(:top_frier) >
|
7
|
-
flipper >
|
8
|
-
frier(:btm_frier) >
|
9
|
-
cooling(:pre_glazer) >
|
10
|
-
glazer >
|
11
|
-
cooling(:ready) >
|
12
|
-
output(:donut_box)
|
13
|
-
end
|
@@ -1,92 +0,0 @@
|
|
1
|
-
1 1872
|
2
|
-
1 alice
|
3
|
-
2 all
|
4
|
-
15 and
|
5
|
-
1 arms
|
6
|
-
1 awhile
|
7
|
-
1 back
|
8
|
-
1 bandersnatch
|
9
|
-
1 beamish
|
10
|
-
2 beware
|
11
|
-
1 bird
|
12
|
-
1 bite
|
13
|
-
1 blade
|
14
|
-
2 borogoves
|
15
|
-
1 boy
|
16
|
-
2 brillig
|
17
|
-
1 burbled
|
18
|
-
1 callay
|
19
|
-
1 callooh
|
20
|
-
2 came
|
21
|
-
1 carroll
|
22
|
-
1 catch
|
23
|
-
1 chortled
|
24
|
-
1 claws
|
25
|
-
1 come
|
26
|
-
1 day
|
27
|
-
1 dead
|
28
|
-
2 did
|
29
|
-
1 eyes
|
30
|
-
1 flame
|
31
|
-
1 foe
|
32
|
-
1 found
|
33
|
-
1 frabjous
|
34
|
-
1 from
|
35
|
-
1 frumious
|
36
|
-
1 galumphing
|
37
|
-
2 gimble
|
38
|
-
1 glass
|
39
|
-
2 gyre
|
40
|
-
1 hand
|
41
|
-
1 has
|
42
|
-
1 head
|
43
|
-
2 his
|
44
|
-
1 its
|
45
|
-
3 jabberwock
|
46
|
-
1 jabberwocky
|
47
|
-
1 jaws
|
48
|
-
1 joy
|
49
|
-
1 jubjub
|
50
|
-
1 left
|
51
|
-
1 lewis
|
52
|
-
1 long
|
53
|
-
1 looking
|
54
|
-
1 manxome
|
55
|
-
2 mimsy
|
56
|
-
2 mome
|
57
|
-
2 one
|
58
|
-
2 outgrabe
|
59
|
-
2 raths
|
60
|
-
1 rested
|
61
|
-
1 shun
|
62
|
-
1 slain
|
63
|
-
2 slithy
|
64
|
-
1 snack
|
65
|
-
1 snicker
|
66
|
-
1 son
|
67
|
-
1 sought
|
68
|
-
2 stood
|
69
|
-
1 sword
|
70
|
-
2 that
|
71
|
-
20 the
|
72
|
-
1 there
|
73
|
-
1 thou
|
74
|
-
2 thought
|
75
|
-
4 through
|
76
|
-
1 time
|
77
|
-
1 took
|
78
|
-
2 toves
|
79
|
-
1 tree
|
80
|
-
1 tulgey
|
81
|
-
1 tumtum
|
82
|
-
2 twas
|
83
|
-
2 two
|
84
|
-
1 uffish
|
85
|
-
2 vorpal
|
86
|
-
2 wabe
|
87
|
-
2 went
|
88
|
-
2 were
|
89
|
-
1 what
|
90
|
-
1 whiffling
|
91
|
-
2 with
|
92
|
-
1 wood
|
data/examples/word_count.rb
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'wukong'
|
4
|
-
|
5
|
-
# cat data/jabberwocky.txt | bin/wu-map examples/word_count.rb | sort | bin/wu-red examples/word_count.rb | sort -rnk2 | head
|
6
|
-
|
7
|
-
Wukong.processor(:add_count) do
|
8
|
-
def process(word)
|
9
|
-
emit [word, 1]
|
10
|
-
end
|
11
|
-
end
|
12
|
-
|
13
|
-
Wukong.processor(:accumulator) do
|
14
|
-
attr_accessor :current, :count
|
15
|
-
|
16
|
-
def setup() reset! ; end
|
17
|
-
|
18
|
-
def stop() report_then_reset! ; end
|
19
|
-
|
20
|
-
def reset!() @current = nil ; @count = 0 ; end
|
21
|
-
|
22
|
-
def report_then_reset!
|
23
|
-
emit [current, count] unless current.nil?
|
24
|
-
reset!
|
25
|
-
end
|
26
|
-
|
27
|
-
def accumulate(word, seen)
|
28
|
-
@current = word if @current.nil?
|
29
|
-
@count += seen
|
30
|
-
end
|
31
|
-
|
32
|
-
def process(pair)
|
33
|
-
word, seen = pair
|
34
|
-
report_then_reset! unless word == current
|
35
|
-
accumulate(word, seen.to_i)
|
36
|
-
end
|
37
|
-
|
38
|
-
end
|
39
|
-
|
40
|
-
Wukong.dataflow(:mapper) do
|
41
|
-
splitter = map { |line| line.downcase.strip.split(/\W/) }
|
42
|
-
cleaner = reject { |word| word.length < 2 }
|
43
|
-
splitter > flatten > cleaner > add_count > to_tsv
|
44
|
-
end
|
45
|
-
|
46
|
-
Wukong.dataflow(:reducer) do
|
47
|
-
from_tsv > accumulator > to_tsv
|
48
|
-
end
|
data/examples/workflow/fiddle.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
connect('split:top').into('flatten:ingredient')
|
5
|
-
|
6
|
-
combine << utensil('bowl') << ingredient('flour') << ingredient('salt') << ingredient('sugar') > ingredient('dough')
|
7
|
-
|
8
|
-
|
9
|
-
task 'package' do
|
10
|
-
slot(:docs) << directory('docs')
|
11
|
-
slot(:exe) << action(:compiled)
|
12
|
-
end
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
wukong 'foo.rb', 'x.tsv', 'y.tsv', :reduce_tasks => 0, :min_split_size => '1M' > 'foo_out.tsv'
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
wukong 'combine.rb', 'x.tsv', 'y.tsv', :reduce_tasks => 0, :min_split_size => '1M' > :raw_pie
|
21
|
-
|
22
|
-
pig
|
23
|
-
|
24
|
-
wukong 'bake.rb', :raw_pie > :pie
|