wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,68 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
class Edge < Struct.new(:src, :dest)
|
6
|
-
end
|
7
|
-
|
8
|
-
class MultiEdge < Struct.new(
|
9
|
-
:src, :dest,
|
10
|
-
:a_follows_b, :b_follows_a,
|
11
|
-
:a_replies_b, :b_replies_a,
|
12
|
-
:a_favorites_b, :b_favorites_a
|
13
|
-
)
|
14
|
-
end
|
15
|
-
|
16
|
-
module Gen1HoodEdges
|
17
|
-
class Mapper < Wukong::Streamer::Base
|
18
|
-
def process rsrc, src, dest
|
19
|
-
# next if (src.to_i == 0) || (dest.to_i == 0)
|
20
|
-
yield [ dest, 'i', src ]
|
21
|
-
yield [ src, 'o', dest]
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
#
|
26
|
-
# Accumulate ( !!in memory!!) all inbound links onto middle node
|
27
|
-
#
|
28
|
-
# Then for each outbound link, loop over those inbound links and emit the
|
29
|
-
# triple (in, mid,out)
|
30
|
-
#
|
31
|
-
class Reducer < Wukong::Streamer::AccumulatingReducer
|
32
|
-
attr_accessor :ins
|
33
|
-
def start! *args
|
34
|
-
self.ins = []
|
35
|
-
end
|
36
|
-
def accumulate mid, dir, node
|
37
|
-
case dir
|
38
|
-
when 'i'
|
39
|
-
self.ins << node
|
40
|
-
if (self.ins.length % 1000 == 0) && (self.ins.length > 10000)
|
41
|
-
$stderr.puts ["Accumulating:", mid, self.ins.length].join("\t")
|
42
|
-
end
|
43
|
-
when 'o'
|
44
|
-
ins.each do |inn|
|
45
|
-
yield ['path_2', inn, mid, node]
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
def finalize
|
50
|
-
end
|
51
|
-
def get_key mid, *_
|
52
|
-
mid
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
class Script < Wukong::Script
|
57
|
-
def default_options
|
58
|
-
super.merge :sort_fields => 2, :partition_fields => 1
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
end
|
63
|
-
|
64
|
-
# Execute the script
|
65
|
-
Gen1HoodEdges::Script.new(
|
66
|
-
Gen1HoodEdges::Mapper,
|
67
|
-
Gen1HoodEdges::Reducer
|
68
|
-
).run
|
@@ -1,112 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong'
|
4
|
-
|
5
|
-
#
|
6
|
-
# Takes any number of flavors of directed edge with the form
|
7
|
-
#
|
8
|
-
# a_relatesto_b src_id dest_id [optional fields]
|
9
|
-
#
|
10
|
-
# and prepares a combined adjacency list. You need to supply a model named
|
11
|
-
# "MultiEdge" with members for each edge type.
|
12
|
-
#
|
13
|
-
# For instance, suppose you have a social network with edges like
|
14
|
-
#
|
15
|
-
# a_follows_b user_a_id user_b_id
|
16
|
-
# a_messages_b user_a_id user_b_id message_id date
|
17
|
-
# a_favorites_b user_a_id user_b_id message_id date
|
18
|
-
#
|
19
|
-
# Your MultiEdge class might look like
|
20
|
-
#
|
21
|
-
# class MultiEdge < Struct(
|
22
|
-
# :src, :dest,
|
23
|
-
# :a_follows_b, :b_follows_a,
|
24
|
-
# :a_messages_b, :b_messages_a,
|
25
|
-
# :a_favorites_b, :b_favorites_a
|
26
|
-
# )
|
27
|
-
# end
|
28
|
-
#
|
29
|
-
# The row for a user pair who follows each other; with user_a #24601 messaging b
|
30
|
-
# 57 times and favoriting 5 of user_b's messages; and user_b #8675309 messaging
|
31
|
-
# 62 times and favoriting none, will emerge as (tab separated, with [blank]
|
32
|
-
# indicating there is no text in that slot):
|
33
|
-
#
|
34
|
-
# ...
|
35
|
-
# 24601 8675309 1 1 57 62 5 [blank]
|
36
|
-
# ...
|
37
|
-
#
|
38
|
-
module GenMultiEdge
|
39
|
-
#
|
40
|
-
# Emit each relation as
|
41
|
-
#
|
42
|
-
# src dest rel
|
43
|
-
#
|
44
|
-
# Canonicalizes the src and dest ids to 10-character, zero-padded strings.
|
45
|
-
# (Ten chars fits a 32-bit up-to-4-billion-and-change unsigned integer.)
|
46
|
-
# Discards all the ancillary crap except +src+, +dest+ and +rel+
|
47
|
-
#
|
48
|
-
class Mapper < Wukong::Streamer::Base
|
49
|
-
def process rsrc, src, dest, *_
|
50
|
-
# note that a_retweets_b_id matches here
|
51
|
-
m = /^a_([a-z]+)_b.*/.match(rsrc) or return
|
52
|
-
rel = m.captures.first
|
53
|
-
src = src.to_i ; dest = dest.to_i
|
54
|
-
return if ((src == 0) || (dest == 0))
|
55
|
-
yield [src, dest, "a_#{rel}_b"]
|
56
|
-
yield [dest, src, "b_#{rel}_a"]
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
#
|
61
|
-
# Aggregate all sightings of relations for each pair into
|
62
|
-
# a single combined
|
63
|
-
#
|
64
|
-
# Note that [a,b] and [b,a] /each/ have a listing, with the a->b and b<-a
|
65
|
-
# relations repeated for each. That is, if there is an "a_messages_b"
|
66
|
-
# relation, you'll have edges
|
67
|
-
#
|
68
|
-
# x y ... a_messages_b(x,y) b_messages_a(y,x) ...
|
69
|
-
# y x ... a_messages_b(y,x) b_messages_a(x,y) ...
|
70
|
-
#
|
71
|
-
#
|
72
|
-
class Reducer < Wukong::Streamer::AccumulatingReducer
|
73
|
-
attr_accessor :multi_edge
|
74
|
-
def get_key src, dest, rel
|
75
|
-
[src, dest]
|
76
|
-
end
|
77
|
-
def start! *args
|
78
|
-
self.multi_edge = MultiEdge.new
|
79
|
-
end
|
80
|
-
def accumulate src, dest, rel
|
81
|
-
self.multi_edge[rel] ||= 0
|
82
|
-
self.multi_edge[rel] += 1
|
83
|
-
end
|
84
|
-
def finalize
|
85
|
-
multi_edge.src, multi_edge.dest = key
|
86
|
-
yield self.multi_edge
|
87
|
-
end
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
Edge = TypedStruct.new(
|
92
|
-
[:src, Integer],
|
93
|
-
[:dest, Integer]
|
94
|
-
)
|
95
|
-
|
96
|
-
MultiEdge = TypedStruct.new(
|
97
|
-
[:src, Integer],
|
98
|
-
[:dest, Integer],
|
99
|
-
[:a_follows_b, Integer],
|
100
|
-
[:b_follows_a, Integer],
|
101
|
-
[:a_replies_b, Integer],
|
102
|
-
[:b_replies_a, Integer],
|
103
|
-
[:a_atsigns_b, Integer],
|
104
|
-
[:b_atsigns_a, Integer],
|
105
|
-
[:a_retweets_b, Integer],
|
106
|
-
[:b_retweets_a, Integer],
|
107
|
-
[:a_favorites_b, Integer],
|
108
|
-
[:b_favorites_a, Integer]
|
109
|
-
)
|
110
|
-
|
111
|
-
# Execute the script
|
112
|
-
Script.new(Mapper, Reducer, :sort_fields => 2).run
|
@@ -1,64 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
$: << File.dirname(__FILE__)+'/../../lib'
|
4
|
-
require 'wukong'
|
5
|
-
|
6
|
-
class Edge < Struct.new(:src, :dest)
|
7
|
-
end
|
8
|
-
|
9
|
-
class ASymmetricB < Edge
|
10
|
-
end
|
11
|
-
|
12
|
-
module Wukong::Streamer
|
13
|
-
class EdgeStreamer < Wukong::Streamer::Base
|
14
|
-
def recordize line
|
15
|
-
rsrc, src, dest, *_ = super(line)
|
16
|
-
[ASymmetricB.new(src.to_i, dest.to_i)]
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
#
|
22
|
-
# Find symmetric links
|
23
|
-
#
|
24
|
-
# Takes adjacency list for a directed graph and emits only edges where
|
25
|
-
# A->B and B->A
|
26
|
-
#
|
27
|
-
# The output will list each such symmetric edge exactly once as
|
28
|
-
# a_symmetric_b node1 node2
|
29
|
-
# where node1 is lexicographically less than node2.
|
30
|
-
#
|
31
|
-
module FindSymmetricLinks
|
32
|
-
|
33
|
-
class Mapper < Wukong::Streamer::EdgeStreamer
|
34
|
-
def process edge
|
35
|
-
yield edge.to_flat(false)
|
36
|
-
yield ASymmetricB.new(edge.dest, edge.src).to_flat(false)
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
#
|
41
|
-
#
|
42
|
-
class Reducer < Wukong::Streamer::Base
|
43
|
-
def stream
|
44
|
-
%x{/usr/bin/uniq -c}.split("\n").each do |line|
|
45
|
-
key_count, rsrc, src, dest, data = line.chomp.strip.split(/\s+/, 4)
|
46
|
-
next unless key_count.to_i == 2
|
47
|
-
next unless src.to_i < dest.to_i
|
48
|
-
emit [src, dest, data].compact
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
class Script < Wukong::Script
|
54
|
-
def default_options
|
55
|
-
super.merge :sort_fields => 3
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
# Execute the script
|
61
|
-
Wukong::Script.new(
|
62
|
-
FindSymmetricLinks::Mapper,
|
63
|
-
FindSymmetricLinks::Reducer
|
64
|
-
).run
|
@@ -1,57 +0,0 @@
|
|
1
|
-
|
2
|
-
-- ===========================================================================
|
3
|
-
--
|
4
|
-
-- Load Graph
|
5
|
-
--
|
6
|
-
AFollowsB = LOAD 'twnew/all/a_follows_b' AS (rsrc: chararray, user_a_id: int, user_b_id: int) ;
|
7
|
-
FollEdges_0 = FOREACH AFollowsB GENERATE user_a_id AS src, user_b_id AS dest ;
|
8
|
-
|
9
|
-
InitPagerankFoll_0 = GROUP FollEdges_0 BY src ;
|
10
|
-
InitPagerankFoll_1 = FOREACH InitPagerankFoll_0 GENERATE
|
11
|
-
group AS src,
|
12
|
-
1.0F AS pagerank:float,
|
13
|
-
FollEdges_0.(dest) AS dests
|
14
|
-
;
|
15
|
-
rmf twnew/pagerank-foll/pagerank_graph_000 ;
|
16
|
-
STORE InitPagerankFoll_1 INTO 'twnew/pagerank-foll/pagerank_graph_000';
|
17
|
-
|
18
|
-
|
19
|
-
-- MultiEdge = LOAD 'twnew/all/multi_edge' AS (
|
20
|
-
-- rsrc: chararray, src: int, dest: int,
|
21
|
-
-- fo: int, fr: int,
|
22
|
-
-- re_out: int, re_in: int,
|
23
|
-
-- at_out: int, at_in: int,
|
24
|
-
-- rt_out: int, rt_in: int,
|
25
|
-
-- fv_out: int, fv_in: int) ;
|
26
|
-
--
|
27
|
-
-- SymmEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, fr ;
|
28
|
-
-- SymmEdges_1 = FILTER SymmEdges_0 BY (fo >= 1.0) AND (fr >= 1.0) ;
|
29
|
-
-- SymmEdges = FOREACH SymmEdges_1 GENERATE src, dest ;
|
30
|
-
-- -- rm twnew/graphs/symm_edges; STORE SymmEdges INTO 'twnew/graphs/symm_edges' ;
|
31
|
-
-- SymmEdges = LOAD 'twnew/graphs/symm_edges' AS (src:int , dest:int);
|
32
|
-
--
|
33
|
-
-- AnyoutEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, re_out, fv_out ;
|
34
|
-
-- AnyoutEdges_1 = FILTER AnyoutEdges_0 BY (fo >= 1.0) OR (re_out >= 1.0) OR (fv_out >= 1.0) ;
|
35
|
-
-- AnyoutEdges = FOREACH AnyoutEdges_1 GENERATE src, dest ;
|
36
|
-
-- -- rm twnew/graphs/anyout_edges; STORE AnyoutEdges INTO 'twnew/graphs/anyout_edges' ;
|
37
|
-
-- AnyoutEdges = LOAD 'twnew/graphs/anyout_edges' AS (src:int , dest:int);
|
38
|
-
--
|
39
|
-
--
|
40
|
-
-- InitPagerankSymm_0 = GROUP SymmEdges BY src ;
|
41
|
-
-- InitPagerankSymm_1 = FOREACH InitPagerankSymm_0 GENERATE
|
42
|
-
-- group AS src,
|
43
|
-
-- 1.0F AS pagerank:float,
|
44
|
-
-- SymmEdges.(dest) AS dests
|
45
|
-
-- ;
|
46
|
-
-- rm twnew/pagerank-symm/pagerank_graph_000 ;
|
47
|
-
-- STORE InitPagerankSymm_1 INTO 'twnew/pagerank-symm/pagerank_graph_000';
|
48
|
-
--
|
49
|
-
--
|
50
|
-
-- InitPagerankAnyout_0 = GROUP AnyoutEdges BY src ;
|
51
|
-
-- InitPagerankAnyout_1 = FOREACH InitPagerankAnyout_0 GENERATE
|
52
|
-
-- group AS src,
|
53
|
-
-- 1.0F AS pagerank:float,
|
54
|
-
-- AnyoutEdges.(dest) AS dests
|
55
|
-
-- ;
|
56
|
-
-- rm twnew/pagerank-anyout/pagerank_graph_000 ;
|
57
|
-
-- STORE InitPagerankAnyout_1 INTO 'twnew/pagerank-anyout/pagerank_graph_000';
|
@@ -1,72 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
module PageRank
|
6
|
-
#
|
7
|
-
# Damping factor (prob. of a 'random' jump)
|
8
|
-
# 0.85 works well in practice. See http://en.wikipedia.org/wiki/Pagerank
|
9
|
-
#
|
10
|
-
DAMPING_FACTOR = 0.85
|
11
|
-
|
12
|
-
# Each user's line looks like
|
13
|
-
# user_a pagerank id1,id2,...,idN
|
14
|
-
# we need to disperse this user's pagerank to each of id1..idN, and
|
15
|
-
# rendezvous the list of outbound links at user_a's reducer as well.
|
16
|
-
module Iterating
|
17
|
-
class PagerankMapper < Wukong::Streamer::Base
|
18
|
-
#
|
19
|
-
# Send pagerank to each page, and send the dests list back to self
|
20
|
-
#
|
21
|
-
def process src, pagerank, dests_str, &block
|
22
|
-
# This lets us use Pig to generate the input
|
23
|
-
dests_str = dests_str.gsub(/[\(\{\}\)]/, '')
|
24
|
-
dests = dests_str.split(",")
|
25
|
-
yield_pagerank_shares src, pagerank, dests, &block
|
26
|
-
yield_own_dest_list src, dests_str, &block
|
27
|
-
end
|
28
|
-
|
29
|
-
# Take the source node's pagerank and distribute it among all the out-nodes
|
30
|
-
def yield_pagerank_shares src, pagerank, dests
|
31
|
-
pagerank_share = pagerank.to_f / dests.length
|
32
|
-
dests.each do |dest|
|
33
|
-
yield [dest, 'p', pagerank_share]
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
# Dispatch this user's out-node list to rendezvous with itself.
|
38
|
-
def yield_own_dest_list src, dests_str
|
39
|
-
yield [src, 'd', dests_str]
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
class PagerankReducer < Wukong::Streamer::AccumulatingReducer
|
44
|
-
attr_accessor :node_id, :pagerank, :dests_str
|
45
|
-
# Begin reduction with 0 accumulated pagerank and no dests as yet
|
46
|
-
def start! node_id, *args
|
47
|
-
self.node_id = node_id
|
48
|
-
self.pagerank = 0.0
|
49
|
-
self.dests_str = nil
|
50
|
-
end
|
51
|
-
# We'll receive fractional pagerank from all incoming edges,
|
52
|
-
# and the destination list from this node's map stage
|
53
|
-
def accumulate node_id, what, val
|
54
|
-
case what
|
55
|
-
when 'p' then self.pagerank += val.to_f
|
56
|
-
when 'd' then self.dests_str = val
|
57
|
-
else raise "Don't know how to accumulate #{[node_id, what, val].inspect}"
|
58
|
-
end
|
59
|
-
end
|
60
|
-
# To finalize, dump the damped pagerank and dest list
|
61
|
-
# in a form that can be fed back into this script
|
62
|
-
def finalize
|
63
|
-
damped_pagerank = (self.pagerank * DAMPING_FACTOR) + (1 - DAMPING_FACTOR)
|
64
|
-
self.dests_str = 'dummy' if self.dests_str.blank?
|
65
|
-
yield [node_id, damped_pagerank, dests_str]
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
|
-
Wukong.run(PagerankMapper, PagerankReducer,
|
70
|
-
:extra_args => ' -jobconf io.sort.record.percent=0.25 ')
|
71
|
-
end
|
72
|
-
end
|
@@ -1,42 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
require 'wukong/streamer/list_reducer'
|
5
|
-
|
6
|
-
module PageRank
|
7
|
-
class Script < Wukong::Script
|
8
|
-
#
|
9
|
-
# Input format is
|
10
|
-
#
|
11
|
-
# rsrc src_id dest_id [... junk ...]
|
12
|
-
#
|
13
|
-
# All we want from the line are its src and dest IDs.
|
14
|
-
#
|
15
|
-
def map_command
|
16
|
-
%Q{/usr/bin/cut -d"\t" -f2,3}
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
#
|
21
|
-
# Accumulate the dests list in memory, dump as a whole. Multiple edges between
|
22
|
-
# any two nodes are permitted, and will accumulate pagerank according to the
|
23
|
-
# edge's multiplicity.
|
24
|
-
#
|
25
|
-
class Reducer < Wukong::Streamer::ListReducer
|
26
|
-
def accumulate src, dest
|
27
|
-
@values << dest
|
28
|
-
end
|
29
|
-
|
30
|
-
# Emit src, initial pagerank, and flattened dests list
|
31
|
-
def finalize
|
32
|
-
@values = ['dummy'] if @values.blank?
|
33
|
-
yield [key, 1.0, @values.to_a.join(",")]
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
# Execute the script
|
38
|
-
Script.new(nil, PageRank::Reducer, :io_sort_record_percent => 0.25).run
|
39
|
-
end
|
40
|
-
|
41
|
-
|
42
|
-
|
@@ -1,21 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
|
3
|
-
# Directory to pagerank on.
|
4
|
-
work_dir=$1 ; shift
|
5
|
-
if [ "$work_dir" == '' ] ; then echo "Please specify the parent of the directory made by gen_initial_pagerank: $0 initial_dir [number_of_iterations] [start_iteration]" ; exit ; fi
|
6
|
-
# How many rounds to run: default 10
|
7
|
-
n_iters=${1-10} ; shift
|
8
|
-
# the iteration to start with: default 0
|
9
|
-
start_i=${1-0} ; shift
|
10
|
-
|
11
|
-
# this directory
|
12
|
-
script_dir="`dirname $0`"
|
13
|
-
|
14
|
-
for (( iter=0 ; "$iter" < "$n_iters" ; iter++ )) ; do
|
15
|
-
curr_str=`printf "%03d" $(( $start_i + $iter ))`
|
16
|
-
next_str=`printf "%03d" $(( $start_i + $iter + 1 ))`
|
17
|
-
curr_dir=$work_dir/pagerank_graph_${curr_str}
|
18
|
-
next_dir=$work_dir/pagerank_graph_${next_str}
|
19
|
-
echo -e "Iteration $(( $iter + 1 )) / $n_iters:\t `basename $curr_dir` => `basename $next_dir`"
|
20
|
-
$script_dir/pagerank.rb --rm --run $curr_dir $next_dir
|
21
|
-
done
|