wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,53 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
#
|
6
|
-
# Bigram counts
|
7
|
-
#
|
8
|
-
# head -n 100 /usr/share/dict/words | ./examples/corpus/words_to_bigrams.rb | sort | /tmp/words_to_bigrams.rb
|
9
|
-
#
|
10
|
-
|
11
|
-
|
12
|
-
#
|
13
|
-
# Kludge to work in Elastic map reduce:
|
14
|
-
#
|
15
|
-
# If your script is ./examples/corpus/words_to_bigrams.rb, make symlinks
|
16
|
-
# to it from ./examples/corpus/words_to_bigrams__map.rb and
|
17
|
-
# ./examples/corpus/words_to_bigrams__reduce.rb
|
18
|
-
#
|
19
|
-
if $0 =~ /__(map|reduce)\.rb$/
|
20
|
-
Settings[$1.to_sym] = true
|
21
|
-
end
|
22
|
-
|
23
|
-
|
24
|
-
#
|
25
|
-
# given one word per line
|
26
|
-
# emits all successive pairs of characters in that word
|
27
|
-
# eg 'boooo-urns' yields
|
28
|
-
# bo oo oo oo o- -u ur rn ns
|
29
|
-
#
|
30
|
-
class WordNGrams < Wukong::Streamer::Base
|
31
|
-
def process word
|
32
|
-
word[0..-2].chars.zip(word[1..-1].chars).each do |ngram_2|
|
33
|
-
yield ngram_2.join('')
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
#
|
39
|
-
# number of unique keys in a row
|
40
|
-
#
|
41
|
-
class KeyCountStreamer < Wukong::Streamer::AccumulatingReducer
|
42
|
-
def start! *args
|
43
|
-
@count = 0
|
44
|
-
end
|
45
|
-
def accumulate *args
|
46
|
-
@count += 1
|
47
|
-
end
|
48
|
-
def finalize
|
49
|
-
yield [key, @count]
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
Wukong::Script.new(WordNGrams, KeyCountStreamer).run
|
@@ -1,110 +0,0 @@
|
|
1
|
-
h1. Using Elastic Map-Reduce in Wukong
|
2
|
-
|
3
|
-
h2. Initial Setup
|
4
|
-
|
5
|
-
# Sign up for elastic map reduce and S3 at Amazon AWS.
|
6
|
-
|
7
|
-
# Download the Amazon elastic-mapreduce runner: either the official version at http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip or the infochimps fork (which has support for Ruby 1.9) at http://github.com/infochimps/elastic-mapreduce .
|
8
|
-
|
9
|
-
# Create a bucket and path to hold your EMR logs, scripts and other ephemera. For instance you might choose 'emr.yourdomain.com' as the bucket and '/wukong' as a scoping path within that bucket. In that case you will refer to it with a path like s3://emr.yourdomain.com/wukong (see notes below about s3n:// vs. s3:// URLs).
|
10
|
-
|
11
|
-
# Copy the contents of wukong/examples/emr/dot_wukong_dir to ~/.wukong
|
12
|
-
|
13
|
-
# Edit emr.yaml and credentials.json, adding your keys where appropriate and following the other instructions. Start with a single-node m1.small cluster as you'll probably have some false starts before the flow of logging in, checking the logs, etc. becomes clear.
|
14
|
-
|
15
|
-
# You should now be good to launch a program. We'll give it the @--alive@ flag so that the machine sticks around if there were any issues:
|
16
|
-
|
17
|
-
./elastic_mapreduce_example.rb --run=emr --alive s3://emr.yourdomain.com/wukong/data/input s3://emr.yourdomain.com/wukong/data/output
|
18
|
-
|
19
|
-
# If you visit the "AWS console":http://bit.ly/awsconsole you should now see a jobflow with two steps. The first sets up debugging for the job; the second is your hadoop task.
|
20
|
-
|
21
|
-
# The "AWS console":http://bit.ly/awsconsole also has the public IP of the master node. You can log in to the machine directly:
|
22
|
-
|
23
|
-
<pre>
|
24
|
-
ssh -i /path/to/your/keypair.pem hadoop@ec2-148-37-14-128.compute-1.amazonaws.com
|
25
|
-
</pre>
|
26
|
-
|
27
|
-
h3. Lorkbong
|
28
|
-
|
29
|
-
Lorkbong (named after the staff carried by Sun Wukong) is a very very simple example Heroku app that lets you trigger showing job status or launching a new job, either by visiting a special URL or by triggering a rake task. Get its code from
|
30
|
-
|
31
|
-
http://github.com/mrflip/lorkbong
|
32
|
-
|
33
|
-
h3. s3n:// vs. s3:// URLs
|
34
|
-
|
35
|
-
Many external tools use a URI convention to address files in S3; they typically use the 's3://' scheme, which makes a lot of sense:
|
36
|
-
s3://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
|
37
|
-
|
38
|
-
Hadoop can maintain an HDFS on the Amazon S3: it uses a block structure and has optimizations for streaming, no file size limitation, and other goodness. However, only hadoop tools can interpret the contents of those blocks -- to everything else it just looks like a soup of blocks labelled block_-8675309 and so forth. Hadoop unfortunately chose the 's3://' scheme for URIs in this filesystem:
|
39
|
-
s3://s3hdfs.yourcompany.com/path/to/data
|
40
|
-
|
41
|
-
Hadoop is happy to read s3 native files -- 'native' as in, you can look at them with a browser and upload them and download them with any S3 tool out there. There's a 5GB limit on file size, and in some cases a performance hit (but not in our experience enough to worry about). You refer to these files with the 's3n://' scheme ('n' as in 'native'):
|
42
|
-
s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-mapper.rb
|
43
|
-
s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-reducer.rb
|
44
|
-
s3n://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
|
45
|
-
|
46
|
-
Wukong will coerce things to the right scheme when it knows what that scheme should be (eg. code should be s3n://). It will otherwise leave the path alone. Specifically, if you use a URI scheme for input and output paths you must use 's3n://' for normal s3 files.
|
47
|
-
|
48
|
-
h2. Advanced Tips n' Tricks for common usage
|
49
|
-
|
50
|
-
h3. Direct access to logs using your browser
|
51
|
-
|
52
|
-
Each Hadoop component exposes a web dashboard for you to access. Use the following ports:
|
53
|
-
|
54
|
-
* 9100: Job tracker (master only)
|
55
|
-
* 9101: Namenode (master only)
|
56
|
-
* 9102: Datanodes
|
57
|
-
* 9103: Task trackers
|
58
|
-
|
59
|
-
They will only, however, respond to web requests from within the private cluster
|
60
|
-
subnet. You can browse the cluster by creating a persistent tunnel to the hadoop master node, and configuring your
|
61
|
-
browser to use it as a proxy.
|
62
|
-
|
63
|
-
h4. Create a tunneling proxy to your cluster
|
64
|
-
|
65
|
-
To create a tunnel from your local machine to the master node, substitute the keypair and the master node's address into this command:
|
66
|
-
|
67
|
-
<pre><code>
|
68
|
-
ssh -i ~/.wukong/keypairs/KEYPAIR.pem -f -N -D 6666 -o StrictHostKeyChecking=no -o "ConnectTimeout=10" -o "ServerAliveInterval=60" -o "ControlPath=none" ubuntu@MASTER_NODE_PUBLIC_IP
|
69
|
-
</code></pre>
|
70
|
-
|
71
|
-
The command will silently background itself if it worked.
|
72
|
-
|
73
|
-
h4. Make your browser use the proxy (but only for cluster machines)
|
74
|
-
|
75
|
-
You can access basic information by pointing your browser to "this Proxy
|
76
|
-
Auto-Configuration (PAC)
|
77
|
-
file.":http://github.com/infochimps/cluster_chef/raw/master/config/proxy.pac
|
78
|
-
You'll have issues if you browse around though, because many of the in-page
|
79
|
-
links will refer to addresses that only resolve within the cluster's private
|
80
|
-
namespace.
|
81
|
-
|
82
|
-
h4. Setup Foxy Proxy
|
83
|
-
|
84
|
-
To fix this, use "FoxyProxy":https://addons.mozilla.org/en-US/firefox/addon/2464
|
85
|
-
It allows you to manage multiple proxy configurations and to use the proxy for
|
86
|
-
DNS resolution (curing the private address problem).
|
87
|
-
|
88
|
-
Once you've installed the FoxyProxy extension and restarted Firefox,
|
89
|
-
|
90
|
-
* Set FoxyProxy to 'Use Proxies based on their pre-defined patterns and priorities'
|
91
|
-
* Create a new proxy, called 'EC2 Socks Proxy' or something
|
92
|
-
* Automatic proxy configuration URL: http://github.com/infochimps/cluster_chef/raw/master/config/proxy.pac
|
93
|
-
* Under 'General', check yes for 'Perform remote DNS lookups on host'
|
94
|
-
* Add the following URL patterns as 'whitelist' using 'Wildcards' (not regular expression):
|
95
|
-
|
96
|
-
* <code>*.compute-*.internal*</code>
|
97
|
-
* <code>*ec2.internal*</code>
|
98
|
-
* <code>*domu*.internal*</code>
|
99
|
-
* <code>*ec2*.amazonaws.com*</code>
|
100
|
-
* <code>*://10.*</code>
|
101
|
-
|
102
|
-
And this one as blacklist:
|
103
|
-
|
104
|
-
* <code>https://us-*st-1.ec2.amazonaws.com/*</code>
|
105
|
-
|
106
|
-
|
107
|
-
h3. Pulling to your local machine
|
108
|
-
|
109
|
-
s3cmd sync s3://s3n.infinitemonkeys.info/emr/elastic_mapreduce_example/log/ /tmp/emr_log/
|
110
|
-
|
@@ -1,69 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Elastic MapReduce config in wukong
|
3
|
-
#
|
4
|
-
|
5
|
-
# ===========================================================================
|
6
|
-
#
|
7
|
-
# Infrastructure options
|
8
|
-
#
|
9
|
-
|
10
|
-
# == Fill all your information into yet another file with your amazon key
|
11
|
-
# It needs to be in so many stupid places because nobody can agree on a
|
12
|
-
# filename or format.
|
13
|
-
#
|
14
|
-
:emr_credentials_file: ~/.wukong/credentials.json
|
15
|
-
|
16
|
-
#
|
17
|
-
# == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here
|
18
|
-
#
|
19
|
-
# :access_key: ASDFAHKHASDF
|
20
|
-
# :secret_access_key: ADSGHASDFJASDFASDF
|
21
|
-
|
22
|
-
# == Path to your keypair file.
|
23
|
-
#
|
24
|
-
:key_pair_file: ~/.wukong/keypairs/gibbon.pem
|
25
|
-
|
26
|
-
# == Keypair will be named after your file, or force the name
|
27
|
-
#
|
28
|
-
# :key_pair: ~
|
29
|
-
|
30
|
-
# == Path to the Amazon elastic-mapreduce runner. Get a copy from
|
31
|
-
# http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
|
32
|
-
#
|
33
|
-
:emr_runner: ~/ics/hadoop/elastic-mapreduce/elastic-mapreduce
|
34
|
-
|
35
|
-
# ===========================================================================
|
36
|
-
#
|
37
|
-
# Remote Paths
|
38
|
-
#
|
39
|
-
|
40
|
-
# == Wukong is opinionated about the paths and locations of scripts and
|
41
|
-
# everything. It will organize files by job name within the following path:
|
42
|
-
#
|
43
|
-
:emr_root: s3://s3n.infinitemonkeys.info/emr
|
44
|
-
|
45
|
-
# == If you specify the :emr_data_root path, then relative pathnames -- ones that
|
46
|
-
# do not look like a URI (s3://yadda/yada) and do not start with a '/' -- will
|
47
|
-
# be prefixed with this path prefix.
|
48
|
-
:emr_data_root: s3n://s3n.infinitemonkeys.info/data
|
49
|
-
|
50
|
-
|
51
|
-
# ===========================================================================
|
52
|
-
#
|
53
|
-
# Cluster Config
|
54
|
-
#
|
55
|
-
:num_instances: 1
|
56
|
-
:instance_type: m1.small
|
57
|
-
:master_instance_type: ~
|
58
|
-
:hadoop_version: '0.20'
|
59
|
-
:availability_zone: us-east-1b
|
60
|
-
|
61
|
-
# ===========================================================================
|
62
|
-
#
|
63
|
-
# Running and reporting options
|
64
|
-
#
|
65
|
-
:alive: true
|
66
|
-
:enable_debugging: true
|
67
|
-
:emr_runner_verbose: true
|
68
|
-
:emr_runner_debug: ~
|
69
|
-
:step_action: CANCEL_AND_WAIT # CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
|
@@ -1,33 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
set -x # turn on tracing
|
3
|
-
|
4
|
-
# A url directory with the scripts you'd like to stuff into the machine
|
5
|
-
REMOTE_FILE_URL_BASE="http://github.com/infochimps/wukong"
|
6
|
-
|
7
|
-
# echo "`date` Broaden the apt universe"
|
8
|
-
# sudo bash -c 'echo "deb http://ftp.us.debian.org/debian lenny multiverse restricted universe" >> /etc/apt/sources.list.d/multiverse.list'
|
9
|
-
|
10
|
-
# Do a non-interactive apt-get so the user is never prompted for input
|
11
|
-
export DEBIAN_FRONTEND=noninteractive
|
12
|
-
|
13
|
-
# Update package index and update the basic system files to newest versions
|
14
|
-
echo "`date` Apt update"
|
15
|
-
sudo apt-get -y update ;
|
16
|
-
sudo dpkg --configure -a
|
17
|
-
echo "`date` Apt upgrade, could take a while"
|
18
|
-
sudo apt-get -y safe-upgrade
|
19
|
-
echo "`date` Apt install"
|
20
|
-
sudo apt-get -f install ;
|
21
|
-
|
22
|
-
echo "`date` Installing base packages"
|
23
|
-
# libopenssl-ruby1.8 ssl-cert
|
24
|
-
sudo apt-get install -y unzip build-essential git-core ruby ruby1.8-dev rubygems ri irb build-essential wget git-core zlib1g-dev libxml2-dev;
|
25
|
-
echo "`date` Unchaining rubygems from the tyrrany of ubuntu"
|
26
|
-
sudo gem install --no-rdoc --no-ri rubygems-update --version=1.3.7 ; sudo /var/lib/gems/1.8/bin/update_rubygems; sudo gem update --no-rdoc --no-ri --system ; gem --version ;
|
27
|
-
|
28
|
-
echo "`date` Installing wukong and related gems"
|
29
|
-
sudo gem install --no-rdoc --no-ri addressable extlib htmlentities configliere yard wukong right_aws uuidtools cheat
|
30
|
-
sudo gem list
|
31
|
-
|
32
|
-
echo "`date` Wukong bootstrap complete: `date`"
|
33
|
-
true
|
@@ -1,28 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
Dir[File.dirname(__FILE__)+'/vendor/**/lib'].each{|dir| $: << dir }
|
3
|
-
require 'rubygems'
|
4
|
-
require 'wukong/script'
|
5
|
-
require 'wukong/script/emr_command'
|
6
|
-
|
7
|
-
#
|
8
|
-
# * Copy the emr.yaml from here into ~/.wukong/emr.yaml
|
9
|
-
# and edit it to suit.
|
10
|
-
# * Download the Amazon elastic-mapreduce runner. Get a copy from
|
11
|
-
# http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
|
12
|
-
# * Find out what breaks, fix it or ask us for help (coders@infochimps.org) and
|
13
|
-
# submit a patch
|
14
|
-
#
|
15
|
-
|
16
|
-
class FooStreamer < Wukong::Streamer::LineStreamer
|
17
|
-
def initialize *args
|
18
|
-
super *args
|
19
|
-
@line_no = 0
|
20
|
-
end
|
21
|
-
|
22
|
-
def process *args
|
23
|
-
yield ["%5d" % @line_no, *args]
|
24
|
-
@line_no += 1
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
Wukong::Script.new(FooStreamer, FooStreamer).run
|
@@ -1,74 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
#
|
6
|
-
# Given an adjacency-pair (from \t to) representation of a directed graph:
|
7
|
-
#
|
8
|
-
# 1 2
|
9
|
-
# 1 7
|
10
|
-
# 2 7
|
11
|
-
# 2 9
|
12
|
-
# 7 2
|
13
|
-
#
|
14
|
-
# It produces an "adjacency list":http://en.wikipedia.org/wiki/Adjacency_list representation:
|
15
|
-
#
|
16
|
-
# 1 > 2 7
|
17
|
-
# 2 > 7 9
|
18
|
-
# 7 > 2
|
19
|
-
# 9 >
|
20
|
-
#
|
21
|
-
# and
|
22
|
-
#
|
23
|
-
# 1 <
|
24
|
-
# 2 < 1 7
|
25
|
-
# 7 < 1 2
|
26
|
-
# 9 < 2
|
27
|
-
#
|
28
|
-
# (each column is tab-separated in the actual output)
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
32
|
-
module Gen1HoodEdges
|
33
|
-
class Mapper < Wukong::Streamer::Base
|
34
|
-
def process rsrc, src, dest, *_
|
35
|
-
src = src.to_i ; dest = dest.to_i
|
36
|
-
yield [ src, '>', dest ]
|
37
|
-
yield [ dest, '<', src ]
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
#
|
42
|
-
# Accumulate links onto single line.
|
43
|
-
#
|
44
|
-
# The reduce key is the target node and direction; we just stream through all
|
45
|
-
# pairs for each target node and output its neighbor nodes on the same line.
|
46
|
-
#
|
47
|
-
# To control memory usage, we will print directly to the output (and not run
|
48
|
-
# through the Emitter)
|
49
|
-
#
|
50
|
-
class Reducer < Wukong::Streamer::AccumulatingReducer
|
51
|
-
# start the output line for this target node and direction
|
52
|
-
def start! target, dir, *args
|
53
|
-
print target + "\t" + dir # start line with target and list type
|
54
|
-
end
|
55
|
-
def accumulate target, dir, neighbor
|
56
|
-
print "\t" + neighbor # append neighbor to output, same line
|
57
|
-
end
|
58
|
-
def finalize
|
59
|
-
puts '' # start new line
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
class Script < Wukong::Script
|
64
|
-
def default_options
|
65
|
-
super.merge :sort_fields => 1, :partition_fields => 1
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
# Execute the script
|
71
|
-
Gen1HoodEdges::Script.new(
|
72
|
-
Gen1HoodEdges::Mapper,
|
73
|
-
Gen1HoodEdges::Reducer
|
74
|
-
).run
|
@@ -1,72 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
#
|
6
|
-
# Use this script to do a Breadth-First Search (BFS) of a graph.
|
7
|
-
#
|
8
|
-
# Usage:
|
9
|
-
# ./make_paths.rb --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
|
10
|
-
#
|
11
|
-
# For example, given an edge list in the file '1path.tsv' that looks like
|
12
|
-
# 1path n1 n2
|
13
|
-
# 1path n1 n3
|
14
|
-
# ... and so forth ...
|
15
|
-
# you can run
|
16
|
-
# for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
|
17
|
-
# to do a 9-deep breadth-first search.
|
18
|
-
#
|
19
|
-
module Gen1HoodEdges
|
20
|
-
class Mapper < Wukong::Streamer::RecordStreamer
|
21
|
-
def initialize
|
22
|
-
@head = Settings[:head]
|
23
|
-
@tail = Settings[:tail]
|
24
|
-
end
|
25
|
-
def process rsrc, *nodes
|
26
|
-
yield [ nodes.last, 'i', nodes[0..-2] ] if (rsrc == self.head)
|
27
|
-
yield [ nodes.first, 'o', nodes[1..-1] ] if (rsrc == self.tail)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
#
|
32
|
-
# Accumulate (!!in memory!!) all inbound links onto the middle node
|
33
|
-
#
|
34
|
-
# Then for each outbound link, loop over those inbound links and emit the
|
35
|
-
# triple (in, mid, out)
|
36
|
-
#
|
37
|
-
class Reducer < Wukong::Streamer::AccumulatingReducer
|
38
|
-
attr_accessor :paths_in, :out_rsrc
|
39
|
-
def initialize
|
40
|
-
self.out_rsrc = Settings[:out_rsrc]
|
41
|
-
end
|
42
|
-
# clear the list of incoming paths
|
43
|
-
def start! *args
|
44
|
-
self.paths_in = []
|
45
|
-
end
|
46
|
-
def accumulate mid, dir, *nodes
|
47
|
-
case dir
|
48
|
-
when 'i'
|
49
|
-
self.paths_in << nodes
|
50
|
-
if (self.paths_in.length % 1000 == 0) && (self.paths_in.length > 10000)
|
51
|
-
$stderr.puts ["Accumulating:", mid, self.paths_in.length].join("\t")
|
52
|
-
end
|
53
|
-
when 'o'
|
54
|
-
paths_in.each do |path_in|
|
55
|
-
yield [self.out_rsrc, path_in, mid, *nodes]
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
def finalize
|
60
|
-
end
|
61
|
-
def get_key mid, *_
|
62
|
-
mid
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
# Execute the script
|
68
|
-
Wukong.run(
|
69
|
-
Gen1HoodEdges::Mapper,
|
70
|
-
Gen1HoodEdges::Reducer,
|
71
|
-
:sort_fields => 2, :partition_fields => 1
|
72
|
-
)
|