wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,140 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
require 'wukong/streamer/count_keys'
|
5
|
-
|
6
|
-
#
|
7
|
-
# Ch3ck out dis moist azz code bitches!!
|
8
|
-
#
|
9
|
-
#
|
10
|
-
|
11
|
-
#
|
12
|
-
# Do nothing more than bin users here, arbitrary and probably bad
|
13
|
-
#
|
14
|
-
class Mapper < Wukong::Streamer::RecordStreamer
|
15
|
-
def process rank, followers
|
16
|
-
followers = followers.to_i
|
17
|
-
if followers > 100
|
18
|
-
yield [9,rank]
|
19
|
-
elsif followers > 75
|
20
|
-
yield [8,rank]
|
21
|
-
elsif followers > 50
|
22
|
-
yield [7,rank]
|
23
|
-
elsif followers > 25
|
24
|
-
yield [6,rank]
|
25
|
-
elsif followers > 15
|
26
|
-
yield [5,rank]
|
27
|
-
elsif followers > 10
|
28
|
-
yield [4,rank]
|
29
|
-
elsif followers > 5
|
30
|
-
yield [3,rank]
|
31
|
-
elsif followers > 4
|
32
|
-
yield [2,rank]
|
33
|
-
elsif followers > 1
|
34
|
-
yield [1,rank]
|
35
|
-
else
|
36
|
-
yield [0,rank]
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
|
42
|
-
#
|
43
|
-
# Calculate percentile rank for every pr value in a given follower bracket
|
44
|
-
#
|
45
|
-
class Reducer < Wukong::Streamer::AccumulatingReducer
|
46
|
-
attr_accessor :count_bin
|
47
|
-
def start! bin, rank
|
48
|
-
self.count_bin ||= {}
|
49
|
-
self.count_bin[bin] ||= {}
|
50
|
-
end
|
51
|
-
|
52
|
-
def accumulate bin, rank
|
53
|
-
rank = (rank.to_f*10.0).round.to_f/10.0
|
54
|
-
self.count_bin[bin][rank] ||= 0
|
55
|
-
self.count_bin[bin][rank] += 1
|
56
|
-
end
|
57
|
-
|
58
|
-
def finalize
|
59
|
-
count_bin[key] = generate_all_pairs(key).inject({}){|h,pair| h[pair.first] = pair.last; h}
|
60
|
-
yield [key, count_bin[key].values.sort.join(",")]
|
61
|
-
end
|
62
|
-
|
63
|
-
#
|
64
|
-
# Write the final table to disk as a ruby hash
|
65
|
-
#
|
66
|
-
def after_stream
|
67
|
-
table = File.open("trstrank_table.rb", 'w')
|
68
|
-
table << "TRSTRANK_TABLE = " << count_bin.inspect
|
69
|
-
table.close
|
70
|
-
end
|
71
|
-
|
72
|
-
#
|
73
|
-
# Return percentile of a given trstrank for a given follower bracket
|
74
|
-
#
|
75
|
-
def percentile bin, rank
|
76
|
-
((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
|
77
|
-
end
|
78
|
-
|
79
|
-
#
|
80
|
-
# Return the count of values less than rank
|
81
|
-
#
|
82
|
-
def count_less_than bin, rank
|
83
|
-
count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f < rank; count}
|
84
|
-
end
|
85
|
-
|
86
|
-
#
|
87
|
-
# Return the count of rank
|
88
|
-
#
|
89
|
-
def frequency_of bin, rank
|
90
|
-
count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f == rank; count}
|
91
|
-
end
|
92
|
-
|
93
|
-
#
|
94
|
-
# Return the total number in sample
|
95
|
-
#
|
96
|
-
def total_num bin
|
97
|
-
count_bin[bin].values.inject(0){|count,v| count += v; count}
|
98
|
-
end
|
99
|
-
|
100
|
-
#
|
101
|
-
# Generate a list of all pairs {trstrank => percentile}, interpolate when necessary
|
102
|
-
#
|
103
|
-
def generate_all_pairs bin
|
104
|
-
h = {}
|
105
|
-
count_bin[bin].keys.each do |rank|
|
106
|
-
h[rank.to_f] = percentile(bin, rank.to_f)
|
107
|
-
end
|
108
|
-
h[0.0] ||= 0.0
|
109
|
-
h[10.0] ||= 100.0
|
110
|
-
arr = h.to_a.sort!{|x,y| x.first <=> y.first}
|
111
|
-
list = arr.zip(arr[1..-1])
|
112
|
-
big_list = []
|
113
|
-
big_list << [0.0,0.0]
|
114
|
-
list.each do |pairs|
|
115
|
-
interpolate(pairs.first, pairs.last, 0.1).each{|pair| big_list << pair}
|
116
|
-
end
|
117
|
-
big_list.uniq.sort{|x,y| x.first <=> y.first}
|
118
|
-
end
|
119
|
-
|
120
|
-
|
121
|
-
#
|
122
|
-
# Nothing to see here, move along
|
123
|
-
#
|
124
|
-
def interpolate pair1, pair2, dx
|
125
|
-
return [pair1] if pair2.blank?
|
126
|
-
m = (pair2.last - pair1.last)/(pair2.first - pair1.first) # slope
|
127
|
-
b = pair2.last - m*pair2.first # y intercept
|
128
|
-
num = ((pair2.first - pair1.first)/dx).abs.round # number of points to interpolate
|
129
|
-
points = []
|
130
|
-
num.times do |i|
|
131
|
-
x = pair1.first + (i+1).to_f*dx
|
132
|
-
y = m*x + b
|
133
|
-
points << [x,y]
|
134
|
-
end
|
135
|
-
points # return an array of pairs
|
136
|
-
end
|
137
|
-
|
138
|
-
end
|
139
|
-
|
140
|
-
Wukong::Script.new(Mapper,Reducer).run
|
@@ -1,173 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
require 'wukong/streamer/rank_and_bin_reducer'
|
5
|
-
|
6
|
-
#
|
7
|
-
# This example uses the classes from http://github.com/mrflip/twitter_friends
|
8
|
-
# (That's sloppy, and I apologize. I'm building this script for that, but it
|
9
|
-
# seems broadly useful and I'm not maintaining two copies. Once this script is
|
10
|
-
# more worky we'll make it standalone. Anyway you should get the picture.)
|
11
|
-
#
|
12
|
-
$: << File.dirname(__FILE__)+'/../../projects/twitter_friends/lib'
|
13
|
-
require 'twitter_friends';
|
14
|
-
require 'twitter_friends/struct_model' ; include TwitterFriends::StructModel
|
15
|
-
|
16
|
-
|
17
|
-
#
|
18
|
-
# attrs to bin
|
19
|
-
#
|
20
|
-
BINNABLE_ATTRS = {
|
21
|
-
:twitter_user => [
|
22
|
-
[:followers_count, :fo ],
|
23
|
-
[:friends_count, :fr ],
|
24
|
-
[:statuses_count, :st ],
|
25
|
-
[:favourites_count, :fv ],
|
26
|
-
[:created_at, :crat ]
|
27
|
-
]
|
28
|
-
|
29
|
-
}
|
30
|
-
RESOURCE_ALIASES = {
|
31
|
-
:twitter_user => :u,
|
32
|
-
:user_metrics => :um,
|
33
|
-
}
|
34
|
-
#
|
35
|
-
# KLUDGE This is not DRY at all but let's get it working first
|
36
|
-
#
|
37
|
-
BinUserMetrics = TypedStruct.new(
|
38
|
-
[:id, Integer],
|
39
|
-
*BINNABLE_ATTRS[:user_metrics].map{|attr, attr_abbr| [attr_abbr, Integer] }
|
40
|
-
)
|
41
|
-
BINNED_RESOURCE_ALIASES = {
|
42
|
-
:u => BinTwitterUser,
|
43
|
-
}
|
44
|
-
|
45
|
-
module RankAndBinAttrs
|
46
|
-
class ExplodeResourceMapper < Wukong::Streamer::StructStreamer
|
47
|
-
def get_and_format_attr thing, attr
|
48
|
-
val = thing.send(attr)
|
49
|
-
case thing.members_types[attr].to_s.to_sym
|
50
|
-
when :Integer then "%010d" % val.to_i
|
51
|
-
when :Float then "%020.7f" % val.to_f
|
52
|
-
when :Bignum then "%020d" % val.to_i
|
53
|
-
else
|
54
|
-
raise [val, thing.members_types[attr].to_s.to_sym].inspect
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
#
|
59
|
-
# The data expansion of this mapper is large enough that it makes sense to
|
60
|
-
# be a little responsible with what we emit. We'll use the RESOURCE_ALIASES
|
61
|
-
# and BINNABLE_ATTRS hashes, above, to dump a more parsimonious
|
62
|
-
# representation.
|
63
|
-
#
|
64
|
-
def process thing, *args, &block
|
65
|
-
attr_abbrs = BINNABLE_ATTRS[thing.class.resource_name]
|
66
|
-
return unless attr_abbrs
|
67
|
-
attr_abbrs.each do |attr, abbr|
|
68
|
-
yield [
|
69
|
-
RESOURCE_ALIASES[thing.class.resource_name],
|
70
|
-
abbr,
|
71
|
-
get_and_format_attr(thing, attr),
|
72
|
-
thing.id.to_i
|
73
|
-
]
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
class BinAttrReducer < Wukong::Streamer::RankAndBinReducer
|
79
|
-
attr_accessor :last_rsrc_attr
|
80
|
-
#
|
81
|
-
# Note that we might get several different resources at the same reducer
|
82
|
-
#
|
83
|
-
def get_key rsrc, attr, val, *args
|
84
|
-
if [rsrc, attr] != self.last_rsrc_attr
|
85
|
-
# Note: since each partition has the same cardinality, we don't need to
|
86
|
-
# fiddle around with the bin_size, etc -- just reset the order
|
87
|
-
# parameters' state.
|
88
|
-
reset_order_params!
|
89
|
-
self.last_rsrc_attr = [rsrc, attr]
|
90
|
-
end
|
91
|
-
val
|
92
|
-
end
|
93
|
-
|
94
|
-
#
|
95
|
-
# Note well -- we are rearranging the field order to
|
96
|
-
#
|
97
|
-
# resource_abbr id attr_abbr bin
|
98
|
-
#
|
99
|
-
# for proper sorting to the re-assembler
|
100
|
-
#
|
101
|
-
def emit record
|
102
|
-
rsrc, attr, val, id, numbering, rank, bin = record
|
103
|
-
super [rsrc, id, attr, bin]
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
class ReassembleObjectReducer < Wukong::Streamer::AccumulatingReducer
|
108
|
-
attr_accessor :thing
|
109
|
-
def klass_from_abbr rsrc_abbr
|
110
|
-
BINNED_RESOURCE_ALIASES[rsrc_abbr.to_sym]
|
111
|
-
end
|
112
|
-
def get_key rsrc_abbr, id, *args
|
113
|
-
[rsrc_abbr, id.to_i]
|
114
|
-
end
|
115
|
-
|
116
|
-
def start! rsrc_abbr, id, *args
|
117
|
-
klass = klass_from_abbr(rsrc_abbr)
|
118
|
-
self.thing = klass.new id.to_i
|
119
|
-
end
|
120
|
-
|
121
|
-
def accumulate rsrc, id, attr, bin
|
122
|
-
thing.send("#{attr}=", bin)
|
123
|
-
end
|
124
|
-
|
125
|
-
def finalize
|
126
|
-
yield thing
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
|
-
#
|
131
|
-
# Two-phase script
|
132
|
-
#
|
133
|
-
# FIXME -- We need a runner class to manage this.
|
134
|
-
#
|
135
|
-
class Script < Wukong::Script
|
136
|
-
attr_accessor :phase
|
137
|
-
# KLUDGE !!
|
138
|
-
def initialize
|
139
|
-
case
|
140
|
-
when ARGV.detect{|arg| arg =~ /--phase=1/}
|
141
|
-
# Phase 1 -- Steal underpants. Also, disassemble each object, and find
|
142
|
-
# the bin for each binnable attribute's value
|
143
|
-
self.phase = 1
|
144
|
-
self.mapper_klass, self.reducer_klass = [ExplodeResourceMapper, BinAttrReducer]
|
145
|
-
when ARGV.detect{|arg| arg =~ /--phase=2/}
|
146
|
-
# Phase 2 -- ????
|
147
|
-
raise "Phase 2 : ????"
|
148
|
-
when ARGV.detect{|arg| arg =~ /--phase=3/}
|
149
|
-
# Phase 3 -- profit. In this case, put records back together.
|
150
|
-
self.phase = 3
|
151
|
-
self.mapper_klass, self.reducer_klass = [nil, ReassembleObjectReducer]
|
152
|
-
else
|
153
|
-
raise "Please run me with a --phase= option"
|
154
|
-
end
|
155
|
-
super mapper_klass, reducer_klass
|
156
|
-
end
|
157
|
-
|
158
|
-
def default_options
|
159
|
-
extra_options =
|
160
|
-
case self.phase
|
161
|
-
# partition on [rsrc, attr]; sort on [rsrc, attr, val]
|
162
|
-
when 1 then { :sort_fields => 3, :partition_fields => 2 }
|
163
|
-
# sort on [rsrc, id]
|
164
|
-
when 3 then { :sort_fields => 2 }
|
165
|
-
else { }
|
166
|
-
end
|
167
|
-
super.merge extra_options
|
168
|
-
end
|
169
|
-
end
|
170
|
-
|
171
|
-
# execute script
|
172
|
-
Script.new.run
|
173
|
-
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
# Run as (local mode)
|
6
|
-
#
|
7
|
-
# ./examples/stupidly_simple_filter.rb --run=local input.tsv output.tsv
|
8
|
-
#
|
9
|
-
# for hadoop mode,
|
10
|
-
#
|
11
|
-
# ./examples/stupidly_simple_filter.rb --run=hadoop input.tsv output.tsv
|
12
|
-
#
|
13
|
-
# For debugging, run
|
14
|
-
#
|
15
|
-
# cat input.tsv | ./examples/stupidly_simple_filter.rb --map input.tsv | more
|
16
|
-
#
|
17
|
-
|
18
|
-
class Mapper < LineStreamer
|
19
|
-
include Filter
|
20
|
-
MATCHER = %r{(ford|mercury|saab|mazda|isuzu)}
|
21
|
-
|
22
|
-
#
|
23
|
-
# A very simple mapper -- looks for a regex match in one field,
|
24
|
-
# and emits the whole record if the field matches
|
25
|
-
#
|
26
|
-
#
|
27
|
-
# Given a series of records like:
|
28
|
-
#
|
29
|
-
# tweet 123456789 20100102030405 @frank: I'm having a bacon sandwich
|
30
|
-
# tweet 123456789 20100102030405 @jerry, I'm having your baby
|
31
|
-
#
|
32
|
-
# emits only the lines matching that regex
|
33
|
-
#
|
34
|
-
def emit? line
|
35
|
-
MATCHER.match line
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
# Execute the script
|
40
|
-
Wukong.run(Mapper)
|
data/old/examples/word_count.rb
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
module WordCount
|
6
|
-
class Mapper < Wukong::Streamer::LineStreamer
|
7
|
-
#
|
8
|
-
# Split a string into its constituent words.
|
9
|
-
#
|
10
|
-
# This is pretty simpleminded:
|
11
|
-
# * downcase the word
|
12
|
-
# * Split at any non-alphanumeric boundary, including '_'
|
13
|
-
# * However, preserve the special cases of 's, 'd or 't at the end of a
|
14
|
-
# word.
|
15
|
-
#
|
16
|
-
# tokenize("Ability is a poor man's wealth #johnwoodenquote")
|
17
|
-
# # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
|
18
|
-
#
|
19
|
-
def tokenize str
|
20
|
-
return [] if str.blank?
|
21
|
-
str = str.downcase;
|
22
|
-
# kill off all punctuation except [stuff]'s or [stuff]'t
|
23
|
-
# this includes hyphens (words are split)
|
24
|
-
str = str.
|
25
|
-
gsub(/[^a-zA-Z0-9\']+/, ' ').
|
26
|
-
gsub(/(\w)\'([stdm]|re|ve|ll)\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
|
27
|
-
# Busticate at whitespace
|
28
|
-
words = str.split(/\s+/)
|
29
|
-
words.reject!{|w| w.length < 3 }
|
30
|
-
words
|
31
|
-
end
|
32
|
-
|
33
|
-
#
|
34
|
-
# Emit each word in each line.
|
35
|
-
#
|
36
|
-
def process line
|
37
|
-
tokenize(line).each{|word| yield [word, 1] }
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
#
|
42
|
-
# You can stack up all the values in a list then sum them at once.
|
43
|
-
#
|
44
|
-
# This isn't good style, as it means the whole list is held in memory
|
45
|
-
#
|
46
|
-
class Reducer1 < Wukong::Streamer::ListReducer
|
47
|
-
def finalize
|
48
|
-
yield [ values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot }, key ]
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
#
|
53
|
-
# A bit kinder to your memory manager: accumulate the sum record-by-record:
|
54
|
-
#
|
55
|
-
class Reducer2 < Wukong::Streamer::AccumulatingReducer
|
56
|
-
def start!(*args) @key_count = 0 end
|
57
|
-
def accumulate(*args) @key_count += 1 end
|
58
|
-
def finalize
|
59
|
-
yield [ @key_count, key ]
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
#
|
64
|
-
# ... easiest of all, though: this is common enough that it's already included
|
65
|
-
#
|
66
|
-
require 'wukong/streamer/count_keys'
|
67
|
-
class Reducer3 < Wukong::Streamer::CountKeys
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
# Execute the script
|
72
|
-
Wukong.run(
|
73
|
-
WordCount::Mapper,
|
74
|
-
WordCount::Reducer2
|
75
|
-
)
|
@@ -1,580 +0,0 @@
|
|
1
|
-
#!/usr/local/bin/ruby -w
|
2
|
-
|
3
|
-
require "enumerator"
|
4
|
-
|
5
|
-
##
|
6
|
-
# Graph models directed graphs and subgraphs and outputs in graphviz's
|
7
|
-
# dot format.
|
8
|
-
|
9
|
-
module Hanuman
|
10
|
-
|
11
|
-
class GraphvizBuilder
|
12
|
-
VERSION = "2.5.0" # :nodoc:
|
13
|
-
|
14
|
-
LIGHT_COLORS = %w(gray lightblue lightcyan lightgray lightpink
|
15
|
-
lightslategray lightsteelblue white)
|
16
|
-
|
17
|
-
# WTF -- can't be %w() because of a bug in rcov
|
18
|
-
BOLD_COLORS = [:black, :brown, :mediumblue, :blueviolet,
|
19
|
-
:orange, :magenta, :darkgreen, :maroon,
|
20
|
-
:violetred, :purple, :greenyellow, :deeppink,
|
21
|
-
:midnightblue, :firebrick, :darkturquoise,
|
22
|
-
:mediumspringgreen, :chartreuse, :navy,
|
23
|
-
:lightseagreen, :chocolate, :lawngreen, :green,
|
24
|
-
:indigo, :darkgoldenrod, :darkviolet, :red,
|
25
|
-
:springgreen, :saddlebrown, :mediumvioletred,
|
26
|
-
:goldenrod, :tomato, :cyan, :forestgreen,
|
27
|
-
:darkorchid, :crimson, :coral, :deepskyblue,
|
28
|
-
:seagreen, :peru, :turquoise, :orangered,
|
29
|
-
:dodgerblue, :sienna, :limegreen, :royalblue,
|
30
|
-
:darkorange, :blue]
|
31
|
-
|
32
|
-
##
|
33
|
-
# Defines the brewer color schemes and the maximum number of colors
|
34
|
-
# in each set.
|
35
|
-
|
36
|
-
COLOR_SCHEME_MAX = {
|
37
|
-
:accent => 8, :blues => 9, :brbg => 11, :bugn => 9,
|
38
|
-
:dark2 => 8, :gnbu => 9, :greens => 9, :greys => 9,
|
39
|
-
:oranges => 9, :orrd => 9, :paired => 12, :pastel1 => 9,
|
40
|
-
:pastel2 => 8, :piyg => 11, :prgn => 11, :pubu => 9,
|
41
|
-
:pubugn => 9, :puor => 11, :purd => 9, :purples => 9,
|
42
|
-
:rdbu => 11, :rdgy => 11, :rdylbu => 11, :rdylgn => 11,
|
43
|
-
:reds => 9, :set1 => 9, :set2 => 8, :set3 => 12,
|
44
|
-
:spectral => 11, :ylgn => 9, :ylgnbu => 9, :ylorbr => 9,
|
45
|
-
:ylorrd => 9
|
46
|
-
}
|
47
|
-
|
48
|
-
SHAPES = %w[
|
49
|
-
Mcircle Mdiamond Msquare box box3d circle component
|
50
|
-
diamond doublecircle doubleoctagon egg ellipse folder
|
51
|
-
hexagon house invhouse invtrapezium invtriangle none
|
52
|
-
note octagon parallelogram pentagon plaintext point
|
53
|
-
polygon rect rectangle septagon square tab trapezium
|
54
|
-
triangle tripleoctagon
|
55
|
-
].map(&:to_sym)
|
56
|
-
|
57
|
-
STYLES = %w(dashed dotted solid invis bold filled diagonals rounded).map(&:to_sym)
|
58
|
-
|
59
|
-
ARROW_RE = /(?:o?[lr]?(?:box|crow|diamond|dot|inv|none|normal|tee|vee)){1,4}/
|
60
|
-
|
61
|
-
ARROWS = %w(box crow diamond dot inv none normal tee vee).map(&:to_sym)
|
62
|
-
|
63
|
-
STYLES.each do |name|
|
64
|
-
define_method(name) { style name }
|
65
|
-
end
|
66
|
-
|
67
|
-
(BOLD_COLORS + LIGHT_COLORS).each do |name|
|
68
|
-
define_method(name) { color name }
|
69
|
-
end
|
70
|
-
|
71
|
-
SHAPES.each do |name|
|
72
|
-
method_name = name.downcase.to_s.sub(/none/, 'shape_none')
|
73
|
-
define_method(method_name) { shape name }
|
74
|
-
end
|
75
|
-
|
76
|
-
ARROWS.each do |name|
|
77
|
-
method_name = {
|
78
|
-
:none => "none_arrow",
|
79
|
-
:box => "box_arrow",
|
80
|
-
:diamond => "diamond_arrow",
|
81
|
-
}[name] || name
|
82
|
-
|
83
|
-
define_method(method_name) { arrowhead name }
|
84
|
-
end
|
85
|
-
|
86
|
-
ENGINES = %w[ circo dot fdp neato osage sfdp twopi ].map(&:to_sym)
|
87
|
-
|
88
|
-
##
|
89
|
-
# A parent graph, if any. Only used for subgraphs.
|
90
|
-
|
91
|
-
attr_accessor :graph
|
92
|
-
|
93
|
-
##
|
94
|
-
# The name of the graph. Optional for graphs and subgraphs. Prefix
|
95
|
-
# the name of a subgraph with "cluster" for subgraph that is boxed.
|
96
|
-
|
97
|
-
attr_accessor :name
|
98
|
-
|
99
|
-
##
|
100
|
-
# Global attributes for edges in this graph.
|
101
|
-
|
102
|
-
attr_reader :edge_attribs
|
103
|
-
|
104
|
-
##
|
105
|
-
# The hash of hashes of edges in this graph. Use #[] or #node to create edges.
|
106
|
-
|
107
|
-
attr_reader :edges
|
108
|
-
|
109
|
-
##
|
110
|
-
# Global attributes for this graph.
|
111
|
-
|
112
|
-
attr_reader :graph_attribs
|
113
|
-
|
114
|
-
##
|
115
|
-
# Global attributes for nodes in this graph.
|
116
|
-
|
117
|
-
attr_reader :node_attribs
|
118
|
-
|
119
|
-
##
|
120
|
-
# The hash of nodes in this graph. Use #[] or #node to create nodes.
|
121
|
-
|
122
|
-
attr_reader :nodes
|
123
|
-
|
124
|
-
##
|
125
|
-
# An array of subgraphs.
|
126
|
-
|
127
|
-
attr_reader :subgraphs
|
128
|
-
|
129
|
-
##
|
130
|
-
# Creates a new graph object. Optional name and parent graph are
|
131
|
-
# available. Also takes an optional block for DSL-like use.
|
132
|
-
|
133
|
-
def initialize name = nil, graph = nil, &block
|
134
|
-
@name = name.to_sym
|
135
|
-
@graph = graph
|
136
|
-
graph << self if graph
|
137
|
-
@nodes = Hash.new { |h,k| h[k] = Node.new self, k }
|
138
|
-
@edges = Hash.new { |h,k|
|
139
|
-
h[k] = Hash.new { |h2, k2| h2[k2] = Edge.new self, self[k], self[k2] }
|
140
|
-
}
|
141
|
-
@graph_attribs = []
|
142
|
-
@node_attribs = []
|
143
|
-
@edge_attribs = []
|
144
|
-
@subgraphs = []
|
145
|
-
|
146
|
-
engine(:dot)
|
147
|
-
|
148
|
-
self.scheme = graph.scheme if graph
|
149
|
-
node_attribs << scheme if scheme
|
150
|
-
configurate(&block) if block
|
151
|
-
end
|
152
|
-
|
153
|
-
def depth
|
154
|
-
graph.nil? ? 0 : graph.depth + 1
|
155
|
-
end
|
156
|
-
|
157
|
-
def configurate(&block)
|
158
|
-
(block.arity == 0) ? instance_eval(&block) : block.call(self)
|
159
|
-
self
|
160
|
-
end
|
161
|
-
|
162
|
-
def engine(engine_name=nil)
|
163
|
-
return @engine unless engine_name
|
164
|
-
raise ArgumentError, "Don't have engine #{engine_name} listed -- should be one of #{ENGINES}" unless ENGINES.include?(engine_name.to_sym)
|
165
|
-
@engine = engine_name
|
166
|
-
end
|
167
|
-
|
168
|
-
##
|
169
|
-
# Push a subgraph into the current graph. Sets the subgraph's graph to self.
|
170
|
-
|
171
|
-
def << subgraph
|
172
|
-
subgraphs << subgraph
|
173
|
-
subgraph.graph = self
|
174
|
-
end
|
175
|
-
|
176
|
-
##
|
177
|
-
# Access a node by name
|
178
|
-
|
179
|
-
def [] name
|
180
|
-
nodes[name]
|
181
|
-
end
|
182
|
-
|
183
|
-
def arrowhead shape
|
184
|
-
raise ArgumentError, "Bad arrow shape: #{shape}" unless shape =~ ARROW_RE
|
185
|
-
Attribute.new "arrowhead = #{shape}"
|
186
|
-
end
|
187
|
-
|
188
|
-
def arrowtail shape
|
189
|
-
raise ArgumentError, "Bad arrow shape: #{shape}" unless shape =~ ARROW_RE
|
190
|
-
Attribute.new "arrowtail = #{shape}"
|
191
|
-
end
|
192
|
-
|
193
|
-
def arrowsize size
|
194
|
-
Attribute.new "arrowsize = #{size}"
|
195
|
-
end
|
196
|
-
|
197
|
-
##
|
198
|
-
# A convenience method to set the global node attributes to use boxes.
|
199
|
-
|
200
|
-
def boxes
|
201
|
-
node_attribs << shape(:box)
|
202
|
-
end
|
203
|
-
|
204
|
-
##
|
205
|
-
# Shortcut method to create a new color Attribute instance.
|
206
|
-
|
207
|
-
def color color
|
208
|
-
Attribute.new "color = #{color}"
|
209
|
-
end
|
210
|
-
|
211
|
-
##
|
212
|
-
# Shortcut method to create a new colorscheme Attribute instance. If
|
213
|
-
# passed +n+, +name+ must match one of the brewer color scheme names
|
214
|
-
# and it will generate accessors for each fillcolor as well as push
|
215
|
-
# the colorscheme onto the node_attribs.
|
216
|
-
|
217
|
-
attr_accessor :scheme
|
218
|
-
|
219
|
-
def colorscheme name, n = nil
|
220
|
-
self.scheme = Attribute.new "colorscheme = #{name}#{n}"
|
221
|
-
max = COLOR_SCHEME_MAX[name.to_sym]
|
222
|
-
|
223
|
-
node_attribs << scheme if max
|
224
|
-
|
225
|
-
scheme
|
226
|
-
end
|
227
|
-
|
228
|
-
(1..COLOR_SCHEME_MAX.values.max).map { |m|
|
229
|
-
define_method "c#{m}" do
|
230
|
-
GraphvizBuilder::Attribute.new("fillcolor = #{m}")
|
231
|
-
end
|
232
|
-
}
|
233
|
-
|
234
|
-
##
|
235
|
-
# Define one or more edges.
|
236
|
-
#
|
237
|
-
# edge :a, :b, :c, ...
|
238
|
-
#
|
239
|
-
# is equivalent to:
|
240
|
-
#
|
241
|
-
# edge :a, :b
|
242
|
-
# edge :b, :c
|
243
|
-
# ...
|
244
|
-
|
245
|
-
def edge(*names)
|
246
|
-
last = nil
|
247
|
-
names.each_cons(2) do |from, to|
|
248
|
-
last = self[from][to]
|
249
|
-
end
|
250
|
-
last
|
251
|
-
end
|
252
|
-
|
253
|
-
##
|
254
|
-
# Creates a new Graph whose edges point the other direction.
|
255
|
-
|
256
|
-
def invert(new_name=nil)
|
257
|
-
result = self.class.new(new_name || "#{name}_inverted")
|
258
|
-
edges.each do |from, h|
|
259
|
-
h.each do |to, edge|
|
260
|
-
result[to][from]
|
261
|
-
end
|
262
|
-
end
|
263
|
-
result
|
264
|
-
end
|
265
|
-
|
266
|
-
##
|
267
|
-
# Shortcut method to create a new fillcolor Attribute instance.
|
268
|
-
|
269
|
-
def fillcolor n
|
270
|
-
Attribute.new "fillcolor = #{n}"
|
271
|
-
end
|
272
|
-
|
273
|
-
##
|
274
|
-
# Shortcut method to create a new font Attribute instance. You can
|
275
|
-
# pass in both the name and an optional font size.
|
276
|
-
|
277
|
-
def font name
|
278
|
-
Attribute.new "fontname = #{name.inspect}"
|
279
|
-
end
|
280
|
-
|
281
|
-
def fontsize size
|
282
|
-
Attribute.new "fontsize = #{size}"
|
283
|
-
end
|
284
|
-
|
285
|
-
##
|
286
|
-
# Shortcut method to set the graph's label. Usually used with subgraphs.
|
287
|
-
|
288
|
-
def label name
|
289
|
-
graph_attribs << %Q{label = "#{name.to_s.gsub(/\n/, '\n')}"} # ""
|
290
|
-
end
|
291
|
-
|
292
|
-
##
|
293
|
-
# Access a node by name, supplying an optional label
|
294
|
-
|
295
|
-
def node name, label = nil
|
296
|
-
n = nodes[name]
|
297
|
-
n.label label if label
|
298
|
-
n
|
299
|
-
end
|
300
|
-
|
301
|
-
##
|
302
|
-
# Shortcut method to specify the orientation of the graph. Defaults
|
303
|
-
# to the graphviz default "TB".
|
304
|
-
|
305
|
-
def orient dir = :TB
|
306
|
-
graph_attribs << "rankdir = #{dir}"
|
307
|
-
end
|
308
|
-
|
309
|
-
##
|
310
|
-
# Shortcut method to specify the orientation of the graph. Defaults to :LR.
|
311
|
-
|
312
|
-
def rotate dir = :LR
|
313
|
-
orient dir
|
314
|
-
end
|
315
|
-
|
316
|
-
##
|
317
|
-
# Saves out both a dot file to path and an image for the specified type.
|
318
|
-
# Specify type as nil to skip exporting an image.
|
319
|
-
|
320
|
-
def save(path, type=nil)
|
321
|
-
File.open "#{path}.dot", "w" do |f|
|
322
|
-
f.puts self.to_s
|
323
|
-
end
|
324
|
-
system "#{engine} -T#{type} #{path}.dot > #{path}.#{type}" if type
|
325
|
-
end
|
326
|
-
|
327
|
-
##
|
328
|
-
# Shortcut method to create a new shape Attribute instance.
|
329
|
-
|
330
|
-
def shape shape
|
331
|
-
Attribute.new "shape = #{shape}"
|
332
|
-
end
|
333
|
-
|
334
|
-
##
|
335
|
-
# Shortcut method to create a new style Attribute instance.
|
336
|
-
|
337
|
-
def style name
|
338
|
-
Attribute.new "style = #{name}"
|
339
|
-
end
|
340
|
-
|
341
|
-
##
|
342
|
-
# Shortcut method to create a subgraph in the current graph. Use
|
343
|
-
# with the top-level +digraph+ method in block form for a graph DSL.
|
344
|
-
|
345
|
-
def subgraph name = nil, &block
|
346
|
-
GraphvizBuilder.new name, self, &block
|
347
|
-
end
|
348
|
-
|
349
|
-
##
|
350
|
-
# Shortcut method to create a clustered subgraph in the current
|
351
|
-
# graph. Use with the top-level +digraph+ method in block form for a
|
352
|
-
# graph DSL.
|
353
|
-
|
354
|
-
def cluster name, &block
|
355
|
-
subgraph "cluster_#{name}", &block
|
356
|
-
end
|
357
|
-
|
358
|
-
##
|
359
|
-
# Outputs a graphviz graph.
|
360
|
-
|
361
|
-
def to_s
|
362
|
-
result = []
|
363
|
-
|
364
|
-
type = graph ? "subgraph " : "digraph "
|
365
|
-
type << "\"#{name}\"" if name and !name.empty?
|
366
|
-
result << "#{type} {"
|
367
|
-
|
368
|
-
graph_attribs.each do |line|
|
369
|
-
result << " #{line};"
|
370
|
-
end
|
371
|
-
|
372
|
-
unless node_attribs.empty? then
|
373
|
-
result << " node [ #{node_attribs.join(", ")} ];"
|
374
|
-
end
|
375
|
-
|
376
|
-
unless edge_attribs.empty? then
|
377
|
-
result << " edge [ #{edge_attribs.join(", ")} ];"
|
378
|
-
end
|
379
|
-
|
380
|
-
subgraphs.each do |line|
|
381
|
-
result << " #{line.to_s.rstrip};"
|
382
|
-
end
|
383
|
-
|
384
|
-
nodes.each do |name, node|
|
385
|
-
result << " #{node.to_s.rstrip};" if graph or node.attributes? or node.orphan?
|
386
|
-
end
|
387
|
-
|
388
|
-
edges.each do |from, deps|
|
389
|
-
deps.each do |to, edge|
|
390
|
-
result << " #{edge.to_s.rstrip};"
|
391
|
-
end
|
392
|
-
end
|
393
|
-
|
394
|
-
result << "}"
|
395
|
-
result.join "\n#{" "*self.depth}"
|
396
|
-
end
|
397
|
-
|
398
|
-
##
|
399
|
-
# An attribute for a graph, node, or edge. Really just a composable
|
400
|
-
# string (via #+) with a convenience method #<< that allows you to
|
401
|
-
# "paint" nodes and edges with this attribute.
|
402
|
-
|
403
|
-
class Attribute < Struct.new :attr
|
404
|
-
##
|
405
|
-
# "Paint" graphs, nodes, and edges with this attribute.
|
406
|
-
#
|
407
|
-
# red << node1 << node2 << node3
|
408
|
-
#
|
409
|
-
# is the same as:
|
410
|
-
#
|
411
|
-
# node1.attributes << red
|
412
|
-
# node2.attributes << red
|
413
|
-
# node3.attributes << red
|
414
|
-
|
415
|
-
def << thing
|
416
|
-
thing.attributes << self
|
417
|
-
thing.attributes.uniq!
|
418
|
-
self
|
419
|
-
end
|
420
|
-
|
421
|
-
##
|
422
|
-
# Returns the attribute in string form.
|
423
|
-
|
424
|
-
alias :to_s :attr
|
425
|
-
|
426
|
-
##
|
427
|
-
# Compose a new attribute from two existing attributes:
|
428
|
-
#
|
429
|
-
# bad_nodes = red + filled + diamond
|
430
|
-
|
431
|
-
def + style
|
432
|
-
c = CompoundAttribute.new
|
433
|
-
c.push self
|
434
|
-
c.push style
|
435
|
-
c
|
436
|
-
end
|
437
|
-
end
|
438
|
-
|
439
|
-
class CompoundAttribute < Attribute
|
440
|
-
def initialize attr = []
|
441
|
-
super
|
442
|
-
end
|
443
|
-
|
444
|
-
def push attrib
|
445
|
-
attr.push attrib
|
446
|
-
end
|
447
|
-
|
448
|
-
def << thing
|
449
|
-
attr.each do |subattr|
|
450
|
-
subattr << thing # allows for recursive compound attributes
|
451
|
-
end
|
452
|
-
self
|
453
|
-
end
|
454
|
-
|
455
|
-
def to_s
|
456
|
-
attr.join ", "
|
457
|
-
end
|
458
|
-
end
|
459
|
-
|
460
|
-
class Thingy < Struct.new :graph, :attributes
|
461
|
-
def initialize graph
|
462
|
-
super graph, []
|
463
|
-
end
|
464
|
-
|
465
|
-
def quote(str)
|
466
|
-
%Q{"#{str}"}
|
467
|
-
end
|
468
|
-
|
469
|
-
def pad_with_attributes(text)
|
470
|
-
width = 40 - (2 * graph.depth)
|
471
|
-
if self.attributes? then
|
472
|
-
"%-#{width}s [ %s ]" % [text, attributes.join(',')]
|
473
|
-
else
|
474
|
-
text
|
475
|
-
end
|
476
|
-
end
|
477
|
-
|
478
|
-
def initialize_copy other # :nodoc:
|
479
|
-
super
|
480
|
-
self.attributes = other.attributes.dup
|
481
|
-
end
|
482
|
-
|
483
|
-
##
|
484
|
-
# Shortcut method to set the label attribute.
|
485
|
-
|
486
|
-
def label name
|
487
|
-
attributes.reject! { |s| s =~ /^label =/ }
|
488
|
-
attributes << "label = \"#{name.to_s.gsub(/\n/, '\n')}\""
|
489
|
-
self
|
490
|
-
end
|
491
|
-
|
492
|
-
##
|
493
|
-
# Does this thing have attributes?
|
494
|
-
|
495
|
-
def attributes?
|
496
|
-
not self.attributes.empty?
|
497
|
-
end
|
498
|
-
end
|
499
|
-
|
500
|
-
##
|
501
|
-
# An edge in a graph.
|
502
|
-
|
503
|
-
class Edge < Thingy
|
504
|
-
|
505
|
-
attr_accessor :from, :to, :from_slot, :to_slot
|
506
|
-
|
507
|
-
##
|
508
|
-
# Create a new edge in +graph+ from +from+ to +to+.
|
509
|
-
|
510
|
-
def initialize graph, from, to, from_slot=nil, to_slot=nil
|
511
|
-
super graph
|
512
|
-
self.from = from
|
513
|
-
self.to = to
|
514
|
-
self.from_slot = from_slot
|
515
|
-
self.to_slot = to_slot
|
516
|
-
end
|
517
|
-
|
518
|
-
##
|
519
|
-
# Returns the edge in dot syntax.
|
520
|
-
|
521
|
-
def to_s
|
522
|
-
from_name = quote(from.name)
|
523
|
-
to_name = quote(to.name)
|
524
|
-
fromto = "%-18s -> %s" % [from_name, to_name]
|
525
|
-
pad_with_attributes(fromto)
|
526
|
-
end
|
527
|
-
end
|
528
|
-
|
529
|
-
##
|
530
|
-
# Nodes in the graph.
|
531
|
-
|
532
|
-
class Node < Thingy
|
533
|
-
|
534
|
-
attr_accessor :name
|
535
|
-
|
536
|
-
def connected?
|
537
|
-
edges = graph.edges
|
538
|
-
|
539
|
-
edges.include?(name) or edges.any? { |from, deps| deps.include? name }
|
540
|
-
end
|
541
|
-
|
542
|
-
def orphan?
|
543
|
-
not connected?
|
544
|
-
end
|
545
|
-
|
546
|
-
##
|
547
|
-
# Create a new Node. Takes a parent graph and a name.
|
548
|
-
|
549
|
-
def initialize graph, name
|
550
|
-
super graph
|
551
|
-
self.name = name
|
552
|
-
end
|
553
|
-
|
554
|
-
##
|
555
|
-
# Create a new node with +name+ and an edge between them pointing
|
556
|
-
# from self to the new node.
|
557
|
-
|
558
|
-
def >> name
|
559
|
-
self[name] # creates node and edge
|
560
|
-
self
|
561
|
-
end
|
562
|
-
|
563
|
-
alias :"<<" :">>"
|
564
|
-
|
565
|
-
##
|
566
|
-
# Returns the edge between self and +dep_name+.
|
567
|
-
|
568
|
-
def [] dep_name
|
569
|
-
graph.edges[name][dep_name]
|
570
|
-
end
|
571
|
-
|
572
|
-
##
|
573
|
-
# Returns the node in dot syntax.
|
574
|
-
|
575
|
-
def to_s
|
576
|
-
pad_with_attributes(quote(name))
|
577
|
-
end
|
578
|
-
end
|
579
|
-
end
|
580
|
-
end
|