wukong 3.0.0.pre → 3.0.0.pre2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,140 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
require 'wukong/streamer/count_keys'
|
5
|
-
|
6
|
-
#
|
7
|
-
# Ch3ck out dis moist azz code bitches!!
|
8
|
-
#
|
9
|
-
#
|
10
|
-
|
11
|
-
#
|
12
|
-
# Do nothing more than bin users here, arbitrary and probably bad
|
13
|
-
#
|
14
|
-
class Mapper < Wukong::Streamer::RecordStreamer
|
15
|
-
def process rank, followers
|
16
|
-
followers = followers.to_i
|
17
|
-
if followers > 100
|
18
|
-
yield [9,rank]
|
19
|
-
elsif followers > 75
|
20
|
-
yield [8,rank]
|
21
|
-
elsif followers > 50
|
22
|
-
yield [7,rank]
|
23
|
-
elsif followers > 25
|
24
|
-
yield [6,rank]
|
25
|
-
elsif followers > 15
|
26
|
-
yield [5,rank]
|
27
|
-
elsif followers > 10
|
28
|
-
yield [4,rank]
|
29
|
-
elsif followers > 5
|
30
|
-
yield [3,rank]
|
31
|
-
elsif followers > 4
|
32
|
-
yield [2,rank]
|
33
|
-
elsif followers > 1
|
34
|
-
yield [1,rank]
|
35
|
-
else
|
36
|
-
yield [0,rank]
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
|
42
|
-
#
|
43
|
-
# Calculate percentile rank for every pr value in a given follower bracket
|
44
|
-
#
|
45
|
-
class Reducer < Wukong::Streamer::AccumulatingReducer
|
46
|
-
attr_accessor :count_bin
|
47
|
-
def start! bin, rank
|
48
|
-
self.count_bin ||= {}
|
49
|
-
self.count_bin[bin] ||= {}
|
50
|
-
end
|
51
|
-
|
52
|
-
def accumulate bin, rank
|
53
|
-
rank = (rank.to_f*10.0).round.to_f/10.0
|
54
|
-
self.count_bin[bin][rank] ||= 0
|
55
|
-
self.count_bin[bin][rank] += 1
|
56
|
-
end
|
57
|
-
|
58
|
-
def finalize
|
59
|
-
count_bin[key] = generate_all_pairs(key).inject({}){|h,pair| h[pair.first] = pair.last; h}
|
60
|
-
yield [key, count_bin[key].values.sort.join(",")]
|
61
|
-
end
|
62
|
-
|
63
|
-
#
|
64
|
-
# Write the final table to disk as a ruby hash
|
65
|
-
#
|
66
|
-
def after_stream
|
67
|
-
table = File.open("trstrank_table.rb", 'w')
|
68
|
-
table << "TRSTRANK_TABLE = " << count_bin.inspect
|
69
|
-
table.close
|
70
|
-
end
|
71
|
-
|
72
|
-
#
|
73
|
-
# Return percentile of a given trstrank for a given follower bracket
|
74
|
-
#
|
75
|
-
def percentile bin, rank
|
76
|
-
((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
|
77
|
-
end
|
78
|
-
|
79
|
-
#
|
80
|
-
# Return the count of values less than rank
|
81
|
-
#
|
82
|
-
def count_less_than bin, rank
|
83
|
-
count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f < rank; count}
|
84
|
-
end
|
85
|
-
|
86
|
-
#
|
87
|
-
# Return the count of rank
|
88
|
-
#
|
89
|
-
def frequency_of bin, rank
|
90
|
-
count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f == rank; count}
|
91
|
-
end
|
92
|
-
|
93
|
-
#
|
94
|
-
# Return the total number in sample
|
95
|
-
#
|
96
|
-
def total_num bin
|
97
|
-
count_bin[bin].values.inject(0){|count,v| count += v; count}
|
98
|
-
end
|
99
|
-
|
100
|
-
#
|
101
|
-
# Generate a list of all pairs {trstrank => percentile}, interpolate when necessary
|
102
|
-
#
|
103
|
-
def generate_all_pairs bin
|
104
|
-
h = {}
|
105
|
-
count_bin[bin].keys.each do |rank|
|
106
|
-
h[rank.to_f] = percentile(bin, rank.to_f)
|
107
|
-
end
|
108
|
-
h[0.0] ||= 0.0
|
109
|
-
h[10.0] ||= 100.0
|
110
|
-
arr = h.to_a.sort!{|x,y| x.first <=> y.first}
|
111
|
-
list = arr.zip(arr[1..-1])
|
112
|
-
big_list = []
|
113
|
-
big_list << [0.0,0.0]
|
114
|
-
list.each do |pairs|
|
115
|
-
interpolate(pairs.first, pairs.last, 0.1).each{|pair| big_list << pair}
|
116
|
-
end
|
117
|
-
big_list.uniq.sort{|x,y| x.first <=> y.first}
|
118
|
-
end
|
119
|
-
|
120
|
-
|
121
|
-
#
|
122
|
-
# Nothing to see here, move along
|
123
|
-
#
|
124
|
-
def interpolate pair1, pair2, dx
|
125
|
-
return [pair1] if pair2.blank?
|
126
|
-
m = (pair2.last - pair1.last)/(pair2.first - pair1.first) # slope
|
127
|
-
b = pair2.last - m*pair2.first # y intercept
|
128
|
-
num = ((pair2.first - pair1.first)/dx).abs.round # number of points to interpolate
|
129
|
-
points = []
|
130
|
-
num.times do |i|
|
131
|
-
x = pair1.first + (i+1).to_f*dx
|
132
|
-
y = m*x + b
|
133
|
-
points << [x,y]
|
134
|
-
end
|
135
|
-
points # return an array of pairs
|
136
|
-
end
|
137
|
-
|
138
|
-
end
|
139
|
-
|
140
|
-
Wukong::Script.new(Mapper,Reducer).run
|
@@ -1,173 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
require 'wukong/streamer/rank_and_bin_reducer'
|
5
|
-
|
6
|
-
#
|
7
|
-
# This example uses the classes from http://github.com/mrflip/twitter_friends
|
8
|
-
# (That's sloppy, and I apologize. I'm building this script for that, but it
|
9
|
-
# seems broadly useful and I'm not maintaining two copies. Once this script is
|
10
|
-
# more worky we'll make it standalone. Anyway you should get the picture.)
|
11
|
-
#
|
12
|
-
$: << File.dirname(__FILE__)+'/../../projects/twitter_friends/lib'
|
13
|
-
require 'twitter_friends';
|
14
|
-
require 'twitter_friends/struct_model' ; include TwitterFriends::StructModel
|
15
|
-
|
16
|
-
|
17
|
-
#
|
18
|
-
# attrs to bin
|
19
|
-
#
|
20
|
-
BINNABLE_ATTRS = {
|
21
|
-
:twitter_user => [
|
22
|
-
[:followers_count, :fo ],
|
23
|
-
[:friends_count, :fr ],
|
24
|
-
[:statuses_count, :st ],
|
25
|
-
[:favourites_count, :fv ],
|
26
|
-
[:created_at, :crat ]
|
27
|
-
]
|
28
|
-
|
29
|
-
}
|
30
|
-
RESOURCE_ALIASES = {
|
31
|
-
:twitter_user => :u,
|
32
|
-
:user_metrics => :um,
|
33
|
-
}
|
34
|
-
#
|
35
|
-
# KLUDGE This is not DRY at all but let's get it working first
|
36
|
-
#
|
37
|
-
BinUserMetrics = TypedStruct.new(
|
38
|
-
[:id, Integer],
|
39
|
-
*BINNABLE_ATTRS[:user_metrics].map{|attr, attr_abbr| [attr_abbr, Integer] }
|
40
|
-
)
|
41
|
-
BINNED_RESOURCE_ALIASES = {
|
42
|
-
:u => BinTwitterUser,
|
43
|
-
}
|
44
|
-
|
45
|
-
module RankAndBinAttrs
|
46
|
-
class ExplodeResourceMapper < Wukong::Streamer::StructStreamer
|
47
|
-
def get_and_format_attr thing, attr
|
48
|
-
val = thing.send(attr)
|
49
|
-
case thing.members_types[attr].to_s.to_sym
|
50
|
-
when :Integer then "%010d" % val.to_i
|
51
|
-
when :Float then "%020.7f" % val.to_f
|
52
|
-
when :Bignum then "%020d" % val.to_i
|
53
|
-
else
|
54
|
-
raise [val, thing.members_types[attr].to_s.to_sym].inspect
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
#
|
59
|
-
# The data expansion of this mapper is large enough that it makes sense to
|
60
|
-
# be a little responsible with what we emit. We'll use the RESOURCE_ALIASES
|
61
|
-
# and BINNABLE_ATTRS hashes, above, to dump a more parsimonious
|
62
|
-
# representation.
|
63
|
-
#
|
64
|
-
def process thing, *args, &block
|
65
|
-
attr_abbrs = BINNABLE_ATTRS[thing.class.resource_name]
|
66
|
-
return unless attr_abbrs
|
67
|
-
attr_abbrs.each do |attr, abbr|
|
68
|
-
yield [
|
69
|
-
RESOURCE_ALIASES[thing.class.resource_name],
|
70
|
-
abbr,
|
71
|
-
get_and_format_attr(thing, attr),
|
72
|
-
thing.id.to_i
|
73
|
-
]
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
class BinAttrReducer < Wukong::Streamer::RankAndBinReducer
|
79
|
-
attr_accessor :last_rsrc_attr
|
80
|
-
#
|
81
|
-
# Note that we might get several different resources at the same reducer
|
82
|
-
#
|
83
|
-
def get_key rsrc, attr, val, *args
|
84
|
-
if [rsrc, attr] != self.last_rsrc_attr
|
85
|
-
# Note: since each partition has the same cardinality, we don't need to
|
86
|
-
# fiddle around with the bin_size, etc -- just reset the order
|
87
|
-
# parameters' state.
|
88
|
-
reset_order_params!
|
89
|
-
self.last_rsrc_attr = [rsrc, attr]
|
90
|
-
end
|
91
|
-
val
|
92
|
-
end
|
93
|
-
|
94
|
-
#
|
95
|
-
# Note well -- we are rearranging the field order to
|
96
|
-
#
|
97
|
-
# resource_abbr id attr_abbr bin
|
98
|
-
#
|
99
|
-
# for proper sorting to the re-assembler
|
100
|
-
#
|
101
|
-
def emit record
|
102
|
-
rsrc, attr, val, id, numbering, rank, bin = record
|
103
|
-
super [rsrc, id, attr, bin]
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
class ReassembleObjectReducer < Wukong::Streamer::AccumulatingReducer
|
108
|
-
attr_accessor :thing
|
109
|
-
def klass_from_abbr rsrc_abbr
|
110
|
-
BINNED_RESOURCE_ALIASES[rsrc_abbr.to_sym]
|
111
|
-
end
|
112
|
-
def get_key rsrc_abbr, id, *args
|
113
|
-
[rsrc_abbr, id.to_i]
|
114
|
-
end
|
115
|
-
|
116
|
-
def start! rsrc_abbr, id, *args
|
117
|
-
klass = klass_from_abbr(rsrc_abbr)
|
118
|
-
self.thing = klass.new id.to_i
|
119
|
-
end
|
120
|
-
|
121
|
-
def accumulate rsrc, id, attr, bin
|
122
|
-
thing.send("#{attr}=", bin)
|
123
|
-
end
|
124
|
-
|
125
|
-
def finalize
|
126
|
-
yield thing
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
|
-
#
|
131
|
-
# Two-phase script
|
132
|
-
#
|
133
|
-
# FIXME -- We need a runner class to manage this.
|
134
|
-
#
|
135
|
-
# Script whose mapper/reducer pair and sort options depend on the --phase=
# command-line argument.
class Script < Wukong::Script
  # The phase number (1 or 3) chosen from ARGV in #initialize.
  attr_accessor :phase

  # KLUDGE !! -- the phase is sniffed directly out of ARGV.
  #
  # Raises a RuntimeError for --phase=2 and when no --phase= argument matches.
  def initialize
    if ARGV.any? { |arg| arg =~ /--phase=1/ }
      # Phase 1 -- Steal underpants. Also, disassemble each object, and find
      # the bin for each binnable attribute's value
      self.phase = 1
      self.mapper_klass, self.reducer_klass = ExplodeResourceMapper, BinAttrReducer
    elsif ARGV.any? { |arg| arg =~ /--phase=2/ }
      # Phase 2 -- ????
      raise "Phase 2 : ????"
    elsif ARGV.any? { |arg| arg =~ /--phase=3/ }
      # Phase 3 -- profit. In this case, put records back together.
      self.phase = 3
      self.mapper_klass, self.reducer_klass = nil, ReassembleObjectReducer
    else
      raise "Please run me with a --phase= option"
    end
    super(mapper_klass, reducer_klass)
  end

  # Merges the phase-specific sort/partition settings into the superclass
  # defaults.
  def default_options
    extra_options =
      if phase == 1
        # partition on [rsrc, attr]; sort on [rsrc, attr, val]
        { :sort_fields => 3, :partition_fields => 2 }
      elsif phase == 3
        # sort on [rsrc, id]
        { :sort_fields => 2 }
      else
        { }
      end
    super.merge(extra_options)
  end
end
|
170
|
-
|
171
|
-
# execute script
|
172
|
-
Script.new.run
|
173
|
-
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
# Run as (local mode)
|
6
|
-
#
|
7
|
-
# ./examples/stupidly_simple_filter.rb --run=local input.tsv output.tsv
|
8
|
-
#
|
9
|
-
# for hadoop mode,
|
10
|
-
#
|
11
|
-
# ./examples/stupidly_simple_filter.rb --run=hadoop input.tsv output.tsv
|
12
|
-
#
|
13
|
-
# For debugging, run
|
14
|
-
#
|
15
|
-
# cat input.tsv | ./examples/stupidly_simple_filter.rb --map input.tsv | more
|
16
|
-
#
|
17
|
-
|
18
|
-
#
# A very simple filtering mapper: a line is emitted only when MATCHER finds a
# match somewhere in it.
#
# Given a series of records like:
#
#   tweet 123456789 20100102030405 @frank: I'm having a bacon sandwich
#   tweet 123456789 20100102030405 @jerry, I'm having your baby
#
# only the lines matching MATCHER are emitted (neither of the two lines above
# contains one of its alternatives).
#
class Mapper < LineStreamer
  include Filter

  # Alternatives searched for in each line.
  MATCHER = /(ford|mercury|saab|mazda|isuzu)/

  # Returns the MatchData (truthy) when +line+ matches MATCHER, nil otherwise.
  def emit?(line)
    MATCHER.match(line)
  end
end
|
38
|
-
|
39
|
-
# Execute the script
|
40
|
-
Wukong.run(Mapper)
|
data/old/examples/word_count.rb
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
module WordCount
|
6
|
-
class Mapper < Wukong::Streamer::LineStreamer
|
7
|
-
#
|
8
|
-
# Split a string into its constituent words.
#
# This is pretty simpleminded:
# * downcase the string
# * split at any non-alphanumeric boundary, including '_' and '-'
# * however, preserve the contractions 's, 't, 'd, 'm, 're, 've and 'll at
#   the end of a word
# * drop every word shorter than three characters
#
#   tokenize("Ability is a poor man's wealth #johnwoodenquote")
#   # => ["ability", "poor", "man's", "wealth", "johnwoodenquote"]
#
# Returns an Array of Strings; nil (or false) input yields an empty Array, as
# do empty and whitespace-only strings.
#
def tokenize str
  # Plain-Ruby guard: does not depend on a monkey-patched #blank?.
  return [] unless str
  text = str.to_s.downcase
  # Kill off all punctuation except apostrophes; this includes hyphens
  # (hyphenated words are split).
  text = text.gsub(/[^a-zA-Z0-9\']+/, ' ')
  # Protect contraction apostrophes with '!' (no '!' can be left in the text
  # after the previous step), blank out every other apostrophe, then restore.
  text = text.gsub(/(\w)\'([stdm]|re|ve|ll)\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
  # Busticate at whitespace and discard the short fragments.
  text.split(/\s+/).reject { |word| word.length < 3 }
end
|
32
|
-
|
33
|
-
#
|
34
|
-
# Emit a [word, 1] pair for every token found in the line.
#
def process(line)
  tokenize(line).each do |token|
    yield [token, 1]
  end
end
|
39
|
-
end
|
40
|
-
|
41
|
-
#
|
42
|
-
# You can stack up all the values in a list then sum them at once.
|
43
|
-
#
|
44
|
-
# This isn't good style, as it means the whole list is held in memory
|
45
|
-
#
|
46
|
-
# Sums the integer value of the last field of every accumulated record and
# emits [total, key]. The whole list of values is held in memory.
class Reducer1 < Wukong::Streamer::ListReducer
  def finalize
    total = values.inject(0) { |sum, record| sum + record.last.to_i }
    yield [total, key]
  end
end
|
51
|
-
|
52
|
-
#
|
53
|
-
# A bit kinder to your memory manager: accumulate the sum record-by-record:
|
54
|
-
#
|
55
|
-
# Counts the records seen for each key, one at a time, and emits
# [count, key] when the key is finished.
class Reducer2 < Wukong::Streamer::AccumulatingReducer
  # A new key begins: restart the counter.
  def start!(*args)
    @key_count = 0
  end

  # One more record for the current key.
  def accumulate(*args)
    @key_count += 1
  end

  # Emit the count for the finished key.
  def finalize
    yield [@key_count, key]
  end
end
|
62
|
-
|
63
|
-
#
|
64
|
-
# ... easiest of all, though: this is common enough that it's already included
|
65
|
-
#
|
66
|
-
require 'wukong/streamer/count_keys'
|
67
|
-
# Adds nothing of its own: all behavior is inherited from
# Wukong::Streamer::CountKeys.
class Reducer3 < Wukong::Streamer::CountKeys; end
|
69
|
-
end
|
70
|
-
|
71
|
-
# Execute the script
|
72
|
-
Wukong.run(
|
73
|
-
WordCount::Mapper,
|
74
|
-
WordCount::Reducer2
|
75
|
-
)
|
@@ -1,580 +0,0 @@
|
|
1
|
-
#!/usr/local/bin/ruby -w
|
2
|
-
|
3
|
-
require "enumerator"
|
4
|
-
|
5
|
-
##
|
6
|
-
# Graph models directed graphs and subgraphs and outputs in graphviz's
|
7
|
-
# dot format.
|
8
|
-
|
9
|
-
module Hanuman
|
10
|
-
|
11
|
-
##
# Builds a directed graph (or subgraph) out of nodes, edges and attributes
# and renders it as graphviz dot source via #to_s / #save.
#
#   g = GraphvizBuilder.new(:deps) do
#     edge :a, :b
#     red << node(:a)
#   end
#   puts g
class GraphvizBuilder
  VERSION = "2.5.0" # :nodoc:

  LIGHT_COLORS = %w(gray lightblue lightcyan lightgray lightpink
                    lightslategray lightsteelblue white)

  # WTF -- can't be %w() because of a bug in rcov
  BOLD_COLORS = [:black, :brown, :mediumblue, :blueviolet,
                 :orange, :magenta, :darkgreen, :maroon,
                 :violetred, :purple, :greenyellow, :deeppink,
                 :midnightblue, :firebrick, :darkturquoise,
                 :mediumspringgreen, :chartreuse, :navy,
                 :lightseagreen, :chocolate, :lawngreen, :green,
                 :indigo, :darkgoldenrod, :darkviolet, :red,
                 :springgreen, :saddlebrown, :mediumvioletred,
                 :goldenrod, :tomato, :cyan, :forestgreen,
                 :darkorchid, :crimson, :coral, :deepskyblue,
                 :seagreen, :peru, :turquoise, :orangered,
                 :dodgerblue, :sienna, :limegreen, :royalblue,
                 :darkorange, :blue]

  ##
  # Defines the brewer color schemes and the maximum number of colors
  # in each set.

  COLOR_SCHEME_MAX = {
    :accent => 8, :blues => 9, :brbg => 11, :bugn => 9,
    :dark2 => 8, :gnbu => 9, :greens => 9, :greys => 9,
    :oranges => 9, :orrd => 9, :paired => 12, :pastel1 => 9,
    :pastel2 => 8, :piyg => 11, :prgn => 11, :pubu => 9,
    :pubugn => 9, :puor => 11, :purd => 9, :purples => 9,
    :rdbu => 11, :rdgy => 11, :rdylbu => 11, :rdylgn => 11,
    :reds => 9, :set1 => 9, :set2 => 8, :set3 => 12,
    :spectral => 11, :ylgn => 9, :ylgnbu => 9, :ylorbr => 9,
    :ylorrd => 9
  }

  SHAPES = %w[
    Mcircle Mdiamond Msquare box box3d circle component
    diamond doublecircle doubleoctagon egg ellipse folder
    hexagon house invhouse invtrapezium invtriangle none
    note octagon parallelogram pentagon plaintext point
    polygon rect rectangle septagon square tab trapezium
    triangle tripleoctagon
  ].map(&:to_sym)

  STYLES = %w(dashed dotted solid invis bold filled diagonals rounded).map(&:to_sym)

  ARROW_RE = /(?:o?[lr]?(?:box|crow|diamond|dot|inv|none|normal|tee|vee)){1,4}/

  ARROWS = %w(box crow diamond dot inv none normal tee vee).map(&:to_sym)

  # One shortcut method per style, e.g. #filled => style(:filled).
  STYLES.each do |style_name|
    define_method(style_name) { style style_name }
  end

  # One shortcut method per color, e.g. #red => color(:red).
  (BOLD_COLORS + LIGHT_COLORS).each do |color_name|
    define_method(color_name) { color color_name }
  end

  # One shortcut method per shape, downcased; :none becomes #shape_none.
  SHAPES.each do |shape_name|
    method_name = shape_name.downcase.to_s.sub(/none/, 'shape_none')
    define_method(method_name) { shape shape_name }
  end

  # One shortcut method per arrowhead. Names already taken by a shape get an
  # "_arrow" suffix.
  ARROWS.each do |arrow_name|
    method_name = {
      :none => "none_arrow",
      :box => "box_arrow",
      :diamond => "diamond_arrow",
    }[arrow_name] || arrow_name

    define_method(method_name) { arrowhead arrow_name }
  end

  ENGINES = %w[ circo dot fdp neato osage sfdp twopi ].map(&:to_sym)

  ##
  # A parent graph, if any. Only used for subgraphs.

  attr_accessor :graph

  ##
  # The name of the graph. Optional for graphs and subgraphs. Prefix
  # the name of a subgraph with "cluster" for subgraph that is boxed.

  attr_accessor :name

  ##
  # The current colorscheme Attribute, if any. Set by #colorscheme and
  # inherited from the parent graph on creation.

  attr_accessor :scheme

  ##
  # Global attributes for edges in this graph.

  attr_reader :edge_attribs

  ##
  # The hash of hashes of edges in this graph. Use #[] or #node to create edges.

  attr_reader :edges

  ##
  # Global attributes for this graph.

  attr_reader :graph_attribs

  ##
  # Global attributes for nodes in this graph.

  attr_reader :node_attribs

  ##
  # The hash of nodes in this graph. Use #[] or #node to create nodes.

  attr_reader :nodes

  ##
  # An array of subgraphs.

  attr_reader :subgraphs

  ##
  # Creates a new graph object. Optional name and parent graph are
  # available. Also takes an optional block for DSL-like use.
  #
  # +name+ is stored as a Symbol; nil (the default) leaves the graph
  # anonymous.

  def initialize(name = nil, graph = nil, &block)
    # Guarded conversion: nil has no #to_sym, so the documented default used
    # to raise NoMethodError.
    @name  = name.nil? ? nil : name.to_sym
    @graph = graph
    graph << self if graph
    # Nodes and edges are created lazily on first lookup.
    @nodes = Hash.new { |h, k| h[k] = Node.new(self, k) }
    @edges = Hash.new { |h, k|
      h[k] = Hash.new { |h2, k2| h2[k2] = Edge.new(self, self[k], self[k2]) }
    }
    @graph_attribs = []
    @node_attribs  = []
    @edge_attribs  = []
    @subgraphs     = []

    engine(:dot)

    self.scheme = graph.scheme if graph
    node_attribs << scheme if scheme
    configurate(&block) if block
  end

  ##
  # Nesting level: 0 for a top-level graph, parent's depth + 1 otherwise.

  def depth
    graph.nil? ? 0 : graph.depth + 1
  end

  ##
  # Runs +block+ against this graph: a block without parameters is
  # instance_eval'd, otherwise the graph is passed in. Returns self.

  def configurate(&block)
    if block.arity == 0
      instance_eval(&block)
    else
      block.call(self)
    end
    self
  end

  ##
  # With no argument, returns the current layout engine. With an argument,
  # sets it; raises ArgumentError unless it is listed in ENGINES.

  def engine(engine_name=nil)
    return @engine unless engine_name
    unless ENGINES.include?(engine_name.to_sym)
      raise ArgumentError, "Don't have engine #{engine_name} listed -- should be one of #{ENGINES}"
    end
    @engine = engine_name
  end

  ##
  # Push a subgraph into the current graph. Sets the subgraph's graph to self.

  def <<(subgraph)
    subgraphs << subgraph
    subgraph.graph = self
  end

  ##
  # Access a node by name

  def [](name)
    nodes[name]
  end

  ##
  # Shortcut method to create a new arrowhead Attribute instance. Raises
  # ArgumentError when +shape+ does not match ARROW_RE.

  def arrowhead(shape)
    # to_s keeps the check working for any argument type (Object#=~ no
    # longer exists in current Rubies).
    raise ArgumentError, "Bad arrow shape: #{shape}" unless shape.to_s =~ ARROW_RE
    Attribute.new "arrowhead = #{shape}"
  end

  ##
  # Shortcut method to create a new arrowtail Attribute instance. Raises
  # ArgumentError when +shape+ does not match ARROW_RE.

  def arrowtail(shape)
    raise ArgumentError, "Bad arrow shape: #{shape}" unless shape.to_s =~ ARROW_RE
    Attribute.new "arrowtail = #{shape}"
  end

  ##
  # Shortcut method to create a new arrowsize Attribute instance.

  def arrowsize(size)
    Attribute.new "arrowsize = #{size}"
  end

  ##
  # A convenience method to set the global node attributes to use boxes.

  def boxes
    node_attribs << shape(:box)
  end

  ##
  # Shortcut method to create a new color Attribute instance.

  def color(color)
    Attribute.new "color = #{color}"
  end

  ##
  # Shortcut method to create a new colorscheme Attribute instance and
  # remember it as #scheme. When +name+ is one of the brewer color scheme
  # names in COLOR_SCHEME_MAX the colorscheme is also pushed onto the
  # node_attribs. Returns the Attribute.

  def colorscheme(name, n = nil)
    self.scheme = Attribute.new "colorscheme = #{name}#{n}"
    max = COLOR_SCHEME_MAX[name.to_sym]

    node_attribs << scheme if max

    scheme
  end

  # Accessors #c1 .. #cN (N = largest scheme size) for numbered fillcolors.
  (1..COLOR_SCHEME_MAX.values.max).each do |m|
    define_method "c#{m}" do
      GraphvizBuilder::Attribute.new("fillcolor = #{m}")
    end
  end

  ##
  # Define one or more edges.
  #
  #   edge :a, :b, :c, ...
  #
  # is equivalent to:
  #
  #   edge :a, :b
  #   edge :b, :c
  #   ...
  #
  # Returns the last edge created (nil when fewer than two names are given).

  def edge(*names)
    last = nil
    names.each_cons(2) do |from, to|
      last = self[from][to]
    end
    last
  end

  ##
  # Creates a new Graph whose edges point the other direction.

  def invert(new_name=nil)
    result = self.class.new(new_name || "#{name}_inverted")
    edges.each do |from, targets|
      targets.each_key do |to|
        result[to][from]
      end
    end
    result
  end

  ##
  # Shortcut method to create a new fillcolor Attribute instance.

  def fillcolor(n)
    Attribute.new "fillcolor = #{n}"
  end

  ##
  # Shortcut method to create a new fontname Attribute instance. Use
  # #fontsize for the size.

  def font(name)
    Attribute.new "fontname = #{name.inspect}"
  end

  ##
  # Shortcut method to create a new fontsize Attribute instance.

  def fontsize(size)
    Attribute.new "fontsize = #{size}"
  end

  ##
  # Shortcut method to set the graph's label. Usually used with subgraphs.

  def label(name)
    graph_attribs << %Q{label = "#{name.to_s.gsub(/\n/, '\n')}"}
  end

  ##
  # Access a node by name, supplying an optional label

  def node(name, label = nil)
    n = nodes[name]
    n.label label if label
    n
  end

  ##
  # Shortcut method to specify the orientation of the graph. Defaults
  # to the graphviz default "TB".

  def orient(dir = :TB)
    graph_attribs << "rankdir = #{dir}"
  end

  ##
  # Shortcut method to specify the orientation of the graph. Defaults to :LR.

  def rotate(dir = :LR)
    orient dir
  end

  ##
  # Saves out both a dot file to path and an image for the specified type.
  # Specify type as nil to skip exporting an image.
  #
  # Returns nil when no image is requested, otherwise the result of running
  # the layout engine.

  def save(path, type=nil)
    File.open("#{path}.dot", "w") do |f|
      f.puts to_s
    end
    return unless type
    # Argument-vector form: no shell is involved, so paths containing spaces
    # or shell metacharacters are passed through untouched.
    system(engine.to_s, "-T#{type}", "#{path}.dot", "-o", "#{path}.#{type}")
  end

  ##
  # Shortcut method to create a new shape Attribute instance.

  def shape(shape)
    Attribute.new "shape = #{shape}"
  end

  ##
  # Shortcut method to create a new style Attribute instance.

  def style(name)
    Attribute.new "style = #{name}"
  end

  ##
  # Shortcut method to create a subgraph in the current graph. Use
  # with the top-level +digraph+ method in block form for a graph DSL.

  def subgraph(name = nil, &block)
    GraphvizBuilder.new name, self, &block
  end

  ##
  # Shortcut method to create a clustered subgraph in the current
  # graph. Use with the top-level +digraph+ method in block form for a
  # graph DSL.

  def cluster(name, &block)
    subgraph "cluster_#{name}", &block
  end

  ##
  # Outputs a graphviz graph.

  def to_s
    type   = graph ? "subgraph " : "digraph "
    # Built without mutating the literal above.
    header = (name && !name.empty?) ? "#{type}\"#{name}\"" : type
    result = ["#{header} {"]

    graph_attribs.each do |line|
      result << " #{line};"
    end

    result << " node [ #{node_attribs.join(", ")} ];" unless node_attribs.empty?
    result << " edge [ #{edge_attribs.join(", ")} ];" unless edge_attribs.empty?

    subgraphs.each do |sub|
      result << " #{sub.to_s.rstrip};"
    end

    # In a top-level graph a node is listed only when it carries attributes
    # or has no edges; inside a subgraph every node is listed.
    nodes.each do |_name, node|
      result << " #{node.to_s.rstrip};" if graph || node.attributes? || node.orphan?
    end

    edges.each do |_from, deps|
      deps.each do |_to, edge|
        result << " #{edge.to_s.rstrip};"
      end
    end

    result << "}"
    result.join("\n#{' ' * depth}")
  end

  ##
  # An attribute for a graph, node, or edge. Really just a composable
  # string (via #+) with a convenience method #<< that allows you to
  # "paint" nodes and edges with this attribute.

  class Attribute < Struct.new :attr
    ##
    # "Paint" nodes and edges (anything with an #attributes array) with
    # this attribute.
    #
    #   red << node1 << node2 << node3
    #
    # is the same as:
    #
    #   node1.attributes << red
    #   node2.attributes << red
    #   node3.attributes << red

    def <<(thing)
      thing.attributes << self
      thing.attributes.uniq!
      self
    end

    ##
    # Returns the attribute in string form.

    alias :to_s :attr

    ##
    # Compose a new attribute from two existing attributes:
    #
    #   bad_nodes = red + filled + diamond

    def +(style)
      c = CompoundAttribute.new
      c.push self
      c.push style
      c
    end
  end

  ##
  # An Attribute whose +attr+ is a list of other attributes.

  class CompoundAttribute < Attribute
    def initialize(attr = [])
      super
    end

    # Appends +attrib+ to the list of sub-attributes.
    def push(attrib)
      attr.push attrib
    end

    # Paints +thing+ with every sub-attribute. Returns self.
    def <<(thing)
      attr.each do |subattr|
        subattr << thing # allows for recursive compound attributes
      end
      self
    end

    # The sub-attributes joined with ", ".
    def to_s
      attr.join ", "
    end
  end

  ##
  # Common base for nodes and edges: the owning graph plus a list of
  # attributes.

  class Thingy < Struct.new :graph, :attributes
    def initialize(graph)
      super graph, []
    end

    # Wraps +str+ in double quotes.
    def quote(str)
      %Q{"#{str}"}
    end

    # Returns +text+ followed by the bracketed attribute list, or just
    # +text+ when there are no attributes. The text column narrows with the
    # depth of the owning graph.
    def pad_with_attributes(text)
      return text unless attributes?
      width = 40 - (2 * graph.depth)
      "%-#{width}s [ %s ]" % [text, attributes.join(',')]
    end

    def initialize_copy(other) # :nodoc:
      super
      self.attributes = other.attributes.dup
    end

    ##
    # Shortcut method to set the label attribute. Replaces any label set
    # earlier. Returns self.

    def label(name)
      # attributes may hold Attribute structs as well as Strings, so compare
      # the string form (calling =~ on a Struct raises on current Rubies).
      attributes.reject! { |s| s.to_s =~ /^label =/ }
      attributes << "label = \"#{name.to_s.gsub(/\n/, '\n')}\""
      self
    end

    ##
    # Does this thing have attributes?

    def attributes?
      !attributes.empty?
    end
  end

  ##
  # An edge in a graph.

  class Edge < Thingy

    attr_accessor :from, :to, :from_slot, :to_slot

    ##
    # Create a new edge in +graph+ from +from+ to +to+.

    def initialize(graph, from, to, from_slot=nil, to_slot=nil)
      super graph
      self.from      = from
      self.to        = to
      self.from_slot = from_slot
      self.to_slot   = to_slot
    end

    ##
    # Returns the edge in dot syntax.

    def to_s
      fromto = "%-18s -> %s" % [quote(from.name), quote(to.name)]
      pad_with_attributes(fromto)
    end
  end

  ##
  # Nodes in the graph.

  class Node < Thingy

    attr_accessor :name

    # True when at least one edge of the owning graph starts or ends here.
    def connected?
      edges = graph.edges

      edges.include?(name) || edges.any? { |_from, deps| deps.include? name }
    end

    # True when no edge of the owning graph touches this node.
    def orphan?
      !connected?
    end

    ##
    # Create a new Node. Takes a parent graph and a name.

    def initialize(graph, name)
      super graph
      self.name = name
    end

    ##
    # Create a new node with +name+ and an edge between them pointing
    # from self to the new node.

    def >>(name)
      self[name] # creates node and edge
      self
    end

    alias :"<<" :">>"

    ##
    # Returns the edge between self and +dep_name+.

    def [](dep_name)
      graph.edges[name][dep_name]
    end

    ##
    # Returns the node in dot syntax.

    def to_s
      pad_with_attributes(quote(name))
    end
  end
end
|
580
|
-
end
|