wukong 3.0.0.pre → 3.0.0.pre2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
data/docpages/INSTALL.textile
DELETED
@@ -1,92 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: Install Notes
|
4
|
-
collapse: false
|
5
|
-
---
|
6
|
-
h1(gemheader). {{ site.gemname }} %(small):: install%
|
7
|
-
|
8
|
-
** "Get the code":#getcode
|
9
|
-
** "Setup":#setup
|
10
|
-
** "Installing and Running Wukong with Hadoop":#gethadoop
|
11
|
-
** "Installing and Running Wukong with Datamapper, ActiveRecord, the command-line and more":#others
|
12
|
-
|
13
|
-
|
14
|
-
<notextile><div class="toggle"></notextile>
|
15
|
-
|
16
|
-
h2(#getcode). Get the code
|
17
|
-
|
18
|
-
We're still actively developing {{ site.gemname }}. The newest version is available via "Git":http://git-scm.com on "github:":http://github.com/mrflip/{{ site.gemname }}
|
19
|
-
|
20
|
-
pre. $ git clone git://github.com/mrflip/{{ site.gemname }}
|
21
|
-
|
22
|
-
A gem is available from "gemcutter:":http://gemcutter.org/gems/{{ site.gemname }}
|
23
|
-
|
24
|
-
pre. $ sudo gem install {{ site.gemname }} --source=http://gemcutter.org
|
25
|
-
|
26
|
-
(don't use the gems.github.com version -- it's way out of date.)
|
27
|
-
|
28
|
-
You can instead download this project in either "zip":http://github.com/mrflip/{{ site.gemname }}/zipball/master or "tar":http://github.com/mrflip/{{ site.gemname }}/tarball/master formats.
|
29
|
-
|
30
|
-
h3. Get the Dependencies
|
31
|
-
|
32
|
-
* Hadoop
|
33
|
-
* Pig (optional)
|
34
|
-
* Parts of {{ site.gemname }} require these gems:
|
35
|
-
** addressable/uri
|
36
|
-
** htmlentities
|
37
|
-
** extlib
|
38
|
-
** YAML
|
39
|
-
** JSON
|
40
|
-
|
41
|
-
<notextile></div><div class="toggle"></notextile>
|
42
|
-
|
43
|
-
h2(#setup). Setup
|
44
|
-
|
45
|
-
1. Allow Wukong to discover where his elephant friend lives by setting a $HADOOP_HOME environment variable: @export HADOOP_HOME="/usr/local/share/hadoop"@
|
46
|
-
2. Add wukong's @bin/@ directory to your $PATH if you'd like to use the "wutils":wutils.html
|
47
|
-
|
48
|
-
<i>(see also: "Ruby Hadoop Quickstart":http://blog.pdatasolutions.com/post/191978092/ruby-on-hadoop-quickstart)</i>
|
49
|
-
|
50
|
-
<notextile></div><div class="toggle"></notextile>
|
51
|
-
|
52
|
-
h2(#gethadoop). Installing and Running Wukong with Hadoop
|
53
|
-
|
54
|
-
Wukong was primarily developed for Hadoop, and we think it's the best way to use Hadoop (it's certainly the most fun!).
|
55
|
-
|
56
|
-
h3. Run Wukong on the Amazon AWS EC2 Cloud
|
57
|
-
|
58
|
-
h3. Hadoop Infrastructure
|
59
|
-
|
60
|
-
Even if you have a bunch of machines with spare cycles, lots of RAM, and a shared filesystem... do yourself a favor and start out using the "Cloudera AMIs on Amazon's EC2 cloud.":http://www.cloudera.com/hadoop-ec2 There are an overwhelming number of fiddly little parameters and you'll be glad for the user experience before you get into server setup. If it's still mid-late 2009 when you read this, ignore prudence and jump straight to using Hadoop 0.20. It will be a) more fun, b) much more robust (trust me, at "v0.20" you want to live on the bleeding edge), and c) you won't have to suffer through migrating your HDFS two weeks after setup.
|
61
|
-
|
62
|
-
To set up Hadoop, your best bet is the Cloudera AMIs on Amazon's EC2 compute cloud:
|
63
|
-
|
64
|
-
* http://www.cloudera.com/hadoop-ec2
|
65
|
-
* http://www.cloudera.com/hadoop-ec2-ebs-beta
|
66
|
-
|
67
|
-
EC2 means anyone with a $10 bill can rent a 10-machine cluster with 1TB of distributed storage for 8 hours.
|
68
|
-
|
69
|
-
h3. Run Wukong using Amazon AWS Elastic MapReduce
|
70
|
-
|
71
|
-
AWS Elastic MapReduce saves the trouble of even setting up a cluster: click, bam, there it is.
|
72
|
-
|
73
|
-
Phil Ripperger has prepared a "Ruby Hadoop Quickstart":http://blog.pdatasolutions.com/post/191978092/ruby-on-hadoop-quickstart explaining how to get started with Wukong, Hadoop and the Amazon Elastic MapReduce cloud -- it's better than anything we could put here. Thanks Phil!
|
74
|
-
|
75
|
-
h3. Set up a Hadoop cluster
|
76
|
-
|
77
|
-
If you have a local cluster, or just want to experiment with a single-machine install, check out the Cloudera packages for both Debian/Ubuntu-based and Redhat/RPM-based Linux systems.
|
78
|
-
|
79
|
-
h3. More Hadoop Notes
|
80
|
-
|
81
|
-
I've braindumped some random notes on configuring and using hadoop "over here":hadoop-tips.html
|
82
|
-
|
83
|
-
<notextile></div><div class="toggle"></notextile>
|
84
|
-
|
85
|
-
h2(#others). Wukong isn't just Hadoop: Datamapper, ActiveRecord, command-line usage and more
|
86
|
-
|
87
|
-
Wukong is used by many in a non-Hadoop environment -- anywhere you can stream data records, you can unleash its monkey power.
|
88
|
-
|
89
|
-
Please see the "usage notes":usage.html#playnice for more!
|
90
|
-
|
91
|
-
|
92
|
-
<notextile></div></notextile>
|
data/docpages/LICENSE.textile
DELETED
@@ -1,107 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: Apache License
|
4
|
-
---
|
5
|
-
|
6
|
-
|
7
|
-
h1(gemheader). {{ site.gemname }} %(small):: license%
|
8
|
-
|
9
|
-
|
10
|
-
The wukong code is __Copyright (c) 2009 Philip (flip) Kromer__
|
11
|
-
|
12
|
-
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
|
13
|
-
|
14
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
15
|
-
|
16
|
-
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an **AS IS** BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
|
17
|
-
|
18
|
-
h1. Apache License
|
19
|
-
|
20
|
-
Apache License
|
21
|
-
Version 2.0, January 2004
|
22
|
-
http://www.apache.org/licenses/
|
23
|
-
|
24
|
-
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
25
|
-
|
26
|
-
<notextile><div class="toggle"></notextile>
|
27
|
-
|
28
|
-
h2. 1. Definitions.
|
29
|
-
|
30
|
-
* **License** shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
|
31
|
-
* **Licensor** shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
|
32
|
-
* **Legal Entity** shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, **control** means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
|
33
|
-
* **You** (or **Your**) shall mean an individual or Legal Entity exercising permissions granted by this License.
|
34
|
-
* **Source** form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
|
35
|
-
* **Object** form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
|
36
|
-
* **Work** shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
|
37
|
-
* **Derivative Works** shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
|
38
|
-
* **Contribution** shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, **submitted** means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
|
39
|
-
* **Contributor** shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
|
40
|
-
|
41
|
-
<notextile></div><div class="toggle"></notextile>
|
42
|
-
|
43
|
-
h2. 2. Grant of Copyright License.
|
44
|
-
|
45
|
-
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
|
46
|
-
|
47
|
-
|
48
|
-
<notextile></div><div class="toggle"></notextile>
|
49
|
-
|
50
|
-
h2. 3. Grant of Patent License.
|
51
|
-
|
52
|
-
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
|
53
|
-
|
54
|
-
<notextile></div><div class="toggle"></notextile>
|
55
|
-
|
56
|
-
h2. 4. Redistribution.
|
57
|
-
|
58
|
-
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
|
59
|
-
|
60
|
-
# You must give any other recipients of the Work or Derivative Works a copy of this License; and
|
61
|
-
# You must cause any modified files to carry prominent notices stating that You changed the files; and
|
62
|
-
# You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
|
63
|
-
# If the Work includes a __NOTICE__ text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
|
64
|
-
|
65
|
-
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
|
66
|
-
|
67
|
-
<notextile></div><div class="toggle"></notextile>
|
68
|
-
|
69
|
-
h2. 5. Submission of Contributions.
|
70
|
-
|
71
|
-
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
|
72
|
-
|
73
|
-
<notextile></div><div class="toggle"></notextile>
|
74
|
-
|
75
|
-
h2. 6. Trademarks.
|
76
|
-
|
77
|
-
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
|
78
|
-
|
79
|
-
<notextile></div><div class="toggle"></notextile>
|
80
|
-
|
81
|
-
h2. 7. Disclaimer of Warranty.
|
82
|
-
|
83
|
-
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an **AS IS** BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
|
84
|
-
|
85
|
-
<notextile></div><div class="toggle"></notextile>
|
86
|
-
|
87
|
-
h2. 8. Limitation of Liability.
|
88
|
-
|
89
|
-
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
|
90
|
-
|
91
|
-
<notextile></div><div class="toggle"></notextile>
|
92
|
-
|
93
|
-
h2. 9. Accepting Warranty or Additional Liability.
|
94
|
-
|
95
|
-
While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
|
96
|
-
|
97
|
-
END OF TERMS AND CONDITIONS
|
98
|
-
|
99
|
-
<notextile></div><div class="toggle"></notextile>
|
100
|
-
|
101
|
-
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
|
102
|
-
|
103
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
104
|
-
|
105
|
-
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an **AS IS** BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
|
106
|
-
|
107
|
-
<notextile></div></notextile>
|
@@ -1,377 +0,0 @@
|
|
1
|
-
h2. Questions
|
2
|
-
|
3
|
-
* can I access an EC2 resource (eg cassandra cluster)
|
4
|
-
|
5
|
-
|
6
|
-
h2. Setup
|
7
|
-
|
8
|
-
* download from http://developer.amazonwebservices.com/connect/entry.jspa?externalID=2264&categoryID=273
|
9
|
-
* wget http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
|
10
|
-
* unzip elastic-mapreduce-ruby.zip
|
11
|
-
* cd elastic-mapreduce-ruby
|
12
|
-
* ln -nfs ~/.wukong/credentials.json
|
13
|
-
* put your keypair in ~/.wukong/keypairs/WHATEVER.pem
|
14
|
-
|
15
|
-
{
|
16
|
-
"access-id": "<insert your aws access id here>",
|
17
|
-
"private-key": "<insert your aws secret access key here>",
|
18
|
-
"key-pair": "WHATEVER",
|
19
|
-
"key-pair-file": "~/.wukong/keypairs/WHATEVER.pem",
|
20
|
-
"log-uri": "s3n://yourmom/emr/logs"
|
21
|
-
}
|
22
|
-
|
23
|
-
h4. Paths
|
24
|
-
|
25
|
-
|
26
|
-
Paths:
|
27
|
-
LogUri s3 s3n://yourmom/emr/logs
|
28
|
-
step log files s3 {log_uri}/Steps/{step}/{syslog,stdout,controller,stderr}
|
29
|
-
Script s3 s3://yourmom/emr/scripts/path/to/script
|
30
|
-
Wukong s3 s3://s3scripts.infochimps.org/wukong/current/....
|
31
|
-
Input s3 s3n://yourmom/data/wordcount/input
|
32
|
-
Output s3 s3n://yourmom/data/wordcount/output
|
33
|
-
Bootstrap Scripts s3 s3://elasticmapreduce/bootstrap-actions/{configure-hadoop,configure-daemons,run-if}
|
34
|
-
|
35
|
-
|
36
|
-
Credentials desk elastic-mapreduce-ruby/credentials.json
|
37
|
-
|
38
|
-
hadoop.tmp.dir inst /mnt/var/lib/hadoop/tmp
|
39
|
-
local hdfs inst /mnt/var/lib/hadoop/dfs
|
40
|
-
your home dir inst /home/hadoop (small space)
|
41
|
-
Job Settings inst /mnt/var/lib/info/job-flow.json
|
42
|
-
Instance Settings inst /mnt/var/lib/info/instance.json
|
43
|
-
|
44
|
-
|
45
|
-
h4. Launching emr tasks in wukong
|
46
|
-
|
47
|
-
* Uses configliere to get your credentials, log_uri, emr_root, script_path
|
48
|
-
* Uploads script phases.
|
49
|
-
s3://emr_root/scripts/:script_path/script_name-datetime-mapper.rb
|
50
|
-
s3://emr_root/scripts/:script_path/script_name-datetime-reducer.rb
|
51
|
-
** You can use the following symbols to assemble the path:
|
52
|
-
:emr_root, :script_name, :script_path, :username, :date, :datetime, :phase, :rand, :pid, :hostname, :keypair
|
53
|
-
The values for :emr_root and :script_path are taken from configliere.
|
54
|
-
if :script_path is missing, scripts/:username is used.
|
55
|
-
The same timestamp and random number will be used for each phase
|
56
|
-
|
57
|
-
* uses elastic-mapreduce-ruby to launch the job
|
58
|
-
|
59
|
-
** specify --emr.{option}
|
60
|
-
** eg --emr.alive, --emr.num-instances
|
61
|
-
|
62
|
-
reads ~/.wukong/emr.yaml
|
63
|
-
|
64
|
-
common
|
65
|
-
jobs / jobname
|
66
|
-
|
67
|
-
name same as for hadoop name
|
68
|
-
alive
|
69
|
-
|
70
|
-
num_instances .
|
71
|
-
instance_type .
|
72
|
-
master_instance_type .
|
73
|
-
availability_zone us-east-1b
|
74
|
-
key_pair job_handle
|
75
|
-
key_pair_file ~/.wukong/keypairs/{key_pair}.pem
|
76
|
-
|
77
|
-
hadoop_version 0.20
|
78
|
-
plain_output Return the job flow id from create step as simple text
|
79
|
-
info JSON hash
|
80
|
-
emr_root
|
81
|
-
log_uri emr_root/logs/:script_path/:script_name-:datetime
|
82
|
-
|
83
|
-
--hadoop-version=0.20 --stream --enable_debugging --verbose --debug --alive
|
84
|
-
--availability-zone AZ --key_pair KP --key_pair_file KPF --access_id EC2ID --private_key EC2PK
|
85
|
-
--slave_instance_type m2.xlarge --master_instance_type m2.xlarge --num_instances NUM
|
86
|
-
#
|
87
|
-
--step_name
|
88
|
-
--step_action CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
|
89
|
-
--jobflow JOBFLOWID
|
90
|
-
#
|
91
|
-
--info Settings.emr.info.to_json
|
92
|
-
#
|
93
|
-
--input INPUT
|
94
|
-
--output OUTPUT
|
95
|
-
--mapper s3://emr_root/jobs/:script_path/script_name-datetime-mapper.rb (or class)
|
96
|
-
--reducer s3://emr_root/jobs/:script_path/script_name-datetime-reducer.rb (or class)
|
97
|
-
--cache s3n://emr_root/jobs/:script_path/cache/sample.py#sample.py
|
98
|
-
--cache-archive s3://s3scripts.infochimps.org/wukong/current/wukong.zip
|
99
|
-
--cache-archive s3n://emr_root/jobs/:script_path/cache/sample.jar
|
100
|
-
--jobconf whatever
|
101
|
-
|
102
|
-
...
|
103
|
-
|
104
|
-
also:
|
105
|
-
|
106
|
-
--ssh
|
107
|
-
--scp SRC --to DEST
|
108
|
-
--terminate
|
109
|
-
--logs
|
110
|
-
--list
|
111
|
-
--all
|
112
|
-
|
113
|
-
h4. Aggregate
|
114
|
-
|
115
|
-
http://hadoop.apache.org/common/docs/r0.20.1/api/org/apache/hadoop/mapred/lib/aggregate/package-summary.html
|
116
|
-
|
117
|
-
DoubleValueSum sums up a sequence of double values.
|
118
|
-
LongValueMax maintain the maximum of a sequence of long values.
|
119
|
-
LongValueMin maintain the minimum of a sequence of long values.
|
120
|
-
LongValueSum sums up a sequence of long values.
|
121
|
-
StringValueMax maintain the biggest of a sequence of strings.
|
122
|
-
StringValueMin maintain the smallest of a sequence of strings.
|
123
|
-
UniqValueCount dedupes a sequence of objects.
|
124
|
-
ValueHistogram computes the histogram of a sequence of strings.
|
125
|
-
|
126
|
-
h2. Commands
|
127
|
-
|
128
|
-
# create a job and run a mapper written in python and stored in Amazon S3
|
129
|
-
elastic-mapreduce --create --enable_debugging \
|
130
|
-
--stream \
|
131
|
-
--mapper s3://elasticmapreduce/samples/wordcount/wordSplitter.py \
|
132
|
-
--input s3n://elasticmapreduce/samples/wordcount/input \
|
133
|
-
--output s3n://mybucket/output_path
|
134
|
-
--log_uri
|
135
|
-
|
136
|
-
elastic-mapreduce --list # list recently created job flows
|
137
|
-
elastic-mapreduce --list --active # list all running or starting job flows
|
138
|
-
elastic-mapreduce --list --all # list all job flows
|
139
|
-
|
140
|
-
h4. Bootstrap actions
|
141
|
-
|
142
|
-
--bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-hadoop
|
143
|
-
--args "--site-config-file,s3://bucket/config.xml,-s,mapred.tasktracker.map.tasks.maximum=2"
|
144
|
-
|
145
|
-
--bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-daemons
|
146
|
-
--args "--namenode-heap-size=2048,--namenode-opts=\"-XX:GCTimeRatio=19\""
|
147
|
-
|
148
|
-
You should recompile cascading applications with the Hadoop 0.20 version specified so they can take advantage of the new features available in this version.
|
149
|
-
Hadoop 0.20 fully supports Pig scripts.
|
150
|
-
All Amazon Elastic MapReduce sample apps are compatible with Hadoop 0.20. The AWS Management Console supports only Hadoop 0.20, so samples will default to 0.20 once launched.
|
151
|
-
|
152
|
-
For Hadoop version 0.20, Hive version 0.5 and Pig version 0.6 are used. The version can be selected by setting HadoopVersion in JobFlowInstancesConfig.
|
153
|
-
|
154
|
-
h3. Pig
|
155
|
-
|
156
|
-
REGISTER s3:///my-bucket/piggybank.jar
|
157
|
-
|
158
|
-
Additional functions:
|
159
|
-
|
160
|
-
http://developer.amazonwebservices.com/connect/entry.jspa?externalID=2730
|
161
|
-
|
162
|
-
|
163
|
-
h2. Hadoop and Cluster setup
|
164
|
-
|
165
|
-
h3. Data Compression
|
166
|
-
|
167
|
-
Output Compression: -jobconf mapred.output.compress=true FileOutputFormat.setCompressOutput(conf, true);
|
168
|
-
Intermediate Compression: -jobconf mapred.compress.map.output=true conf.setCompressMapOutput(true);
|
169
|
-
|
170
|
-
You can also use a bootstrap action to automatically compress all job outputs. Here is how to do that with the Ruby client.
|
171
|
-
|
172
|
-
--bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-hadoop --args "-s,mapred.output.compress=true"
|
173
|
-
|
174
|
-
Compressed Input data: Hadoop automatically detects the .gz extension on file names and extracts the contents. You do not need to take any action to extract gzipped files.
|
175
|
-
|
176
|
-
|
177
|
-
===========================================================================
|
178
|
-
|
179
|
-
|
180
|
-
$LOAD_PATH << File.dirname(__FILE__)
|
181
|
-
require 'amazon/coral/elasticmapreduceclient'
|
182
|
-
require 'amazon/retry_delegator'
|
183
|
-
|
184
|
-
config = {
|
185
|
-
:endpoint => "https://elasticmapreduce.amazonaws.com",
|
186
|
-
:ca_file => File.join(File.dirname(__FILE__), "cacert.pem"),
|
187
|
-
:aws_access_key => my_access_id,
|
188
|
-
:aws_secret_key => my_secret_key,
|
189
|
-
:signature_algorithm => :V2
|
190
|
-
}
|
191
|
-
client = Amazon::Coral::ElasticMapReduceClient.new_aws_query(config)
|
192
|
-
|
193
|
-
is_retryable_error_response = Proc.new do |response|
|
194
|
-
if response == nil then
|
195
|
-
false
|
196
|
-
else
|
197
|
-
ret = false
|
198
|
-
if response['Error'] then
|
199
|
-
ret ||= ['InternalFailure', 'Throttling', 'ServiceUnavailable', 'Timeout'].include?(response['Error']['Code'])
|
200
|
-
end
|
201
|
-
ret
|
202
|
-
end
|
203
|
-
end
|
204
|
-
|
205
|
-
client = Amazon::RetryDelegator.new(client, :retry_if => is_retryable_error_response)
|
206
|
-
|
207
|
-
puts client.DescribeJobFlows.inspect
|
208
|
-
puts client.DescribeJobFlows('JobFlowId' => 'j-ABAYAS1019012').inspect
|
209
|
-
|
210
|
-
h3. Example job-flow.json and instance.json
|
211
|
-
|
212
|
-
job-flow.json {"jobFlowId":"j-1UVPY9PQ3XAXE","jobFlowCreationInstant":1271711181000,
|
213
|
-
"instanceCount":4,"masterInstanceId":"i-f987ee92","masterPrivateDnsName":
|
214
|
-
"localhost","masterInstanceType":"m1.small","slaveInstanceType":
|
215
|
-
"m1.small","hadoopVersion":"0.18"}
|
216
|
-
|
217
|
-
instance.json {"isMaster":true,"isRunningNameNode":true,"isRunningDataNode":true,
|
218
|
-
"isRunningJobTracker":false,"isRunningTaskTracker":false}
|
219
|
-
|
220
|
-
h3. Configuration
|
221
|
-
|
222
|
-
h4. Configure Hadoop
|
223
|
-
|
224
|
-
Location: s3://elasticmapreduce/bootstrap-actions/configure-hadoop
|
225
|
-
|
226
|
-
-<f>, --<file>-key-value
|
227
|
-
Key/value pair that will be merged into the specified config file.
|
228
|
-
|
229
|
-
-<F>, --<file>-config-file
|
230
|
-
Config file in Amazon S3 or locally that will be merged with the specified config file.
|
231
|
-
|
232
|
-
Acceptable config files:
|
233
|
-
s/S site hadoop-site.xml
|
234
|
-
d/D default hadoop-default.xml
|
235
|
-
c/C core core-site.xml
|
236
|
-
h/H hdfs hdfs-site.xml
|
237
|
-
m/M mapred mapred-site.xml
|
238
|
-
|
239
|
-
|
240
|
-
Example Usage:
|
241
|
-
|
242
|
-
elastic-mapreduce --create \
|
243
|
-
--bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-hadoop \
|
244
|
-
--args "--site-config-file,s3://bucket/config.xml,-s,mapred.tasktracker.map.tasks.maximum=2"
|
245
|
-
|
246
|
-
|
247
|
-
Specify no reducers:
|
248
|
-
--mapred-key-value mapred.reduce.tasks=0
|
249
|
-
|
250
|
-
|
251
|
-
-cacheFile -files Comma separated URIs
|
252
|
-
-cacheArchive -archives Comma separated URIs
|
253
|
-
-jobconf -D key=value
|
254
|
-
|
255
|
-
h4. Run If
|
256
|
-
|
257
|
-
Location: s3://elasticmapreduce/bootstrap-actions/run-if <JSON path>[!]=<value> <command> [args...]
|
258
|
-
|
259
|
-
JSON path A path in the instance config or job flow config for the key we should look up.
|
260
|
-
Value The value we expect to find.
|
261
|
-
Command The command to run if the value is what we expect (or not what we expect in the case of !=). This can be a path in S3 or a local command.
|
262
|
-
Args Arguments to pass to the command as it runs.
|
263
|
-
|
264
|
-
elastic-mapreduce --create --alive \
|
265
|
-
--bootstrap-action s3://elasticmapreduce/bootstrap-actions/run-if \
|
266
|
-
--args "instance.isMaster=true,echo,Running,on,master,node"
|
267
|
-
|
268
|
-
|
269
|
-
h4. Configure Daemons
|
270
|
-
|
271
|
-
--<daemon>-heap-size Set the heap size in megabytes for the specified daemon.
|
272
|
-
--<daemon>-opts Set additional Java options for the specified daemon.
|
273
|
-
--replace Replace the existing hadoop-user-env.sh file if it exists.
|
274
|
-
|
275
|
-
<daemon> is one of: namenode, datanode, jobtracker, tasktracker, client
|
276
|
-
|
277
|
-
elastic-mapreduce --create --alive
|
278
|
-
--bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-daemons
|
279
|
-
--args "--namenode-heap-size=2048,--namenode-opts=\"-XX:GCTimeRatio=19\""
|
280
|
-
|
281
|
-
|
282
|
-
h2. Command Line
|
283
|
-
|
284
|
-
|
285
|
-
Creating Job Flows
|
286
|
-
--create Create a new job flow
|
287
|
-
--name NAME Name of the job flow
|
288
|
-
--alive Create a job flow that stays running even though it has executed all its steps
|
289
|
-
--num-instances NUM Number of instances in the job flow
|
290
|
-
--instance-type TYPE The type of the instances to launch
|
291
|
-
--slave-instance-type TYPE The type of the slave instances to launch
|
292
|
-
--master-instance-type TYPE The type of the master instance to launch
|
293
|
-
--key-pair KEY_PAIR The name of your Amazon EC2 Keypair
|
294
|
-
--key-pair-file FILE_PATH Path to your local pem file for your EC2 key pair
|
295
|
-
--log-uri LOG_URI Location in S3 to store logs from the job flow, e.g. s3n://mybucket/logs
|
296
|
-
--availability-zone A_Z Specify the Availability Zone in which to launch the jobflow
|
297
|
-
--info INFO Specify additional info in JSON
|
298
|
-
--hadoop-version INFO Specify the Hadoop Version to install
|
299
|
-
--plain-output Return the job flow id from create step as simple text
|
300
|
-
|
301
|
-
Adding Jar Steps to Job Flows
|
302
|
-
--jar JAR Add a step that executes a jar
|
303
|
-
--wait-for-step Wait for the step to finish
|
304
|
-
--main-class MAIN_CLASS Specify main class for the JAR
|
305
|
-
|
306
|
-
Adding Streaming Steps to Job Flows
|
307
|
-
--stream Add a step that performs hadoop streaming
|
308
|
-
--input INPUT Input to the steps, e.g. s3n://mybucket/input
|
309
|
-
--output OUTPUT The output to the steps, e.g. s3n://mybucket/output
|
310
|
-
--mapper MAPPER The mapper program or class
|
311
|
-
--cache CACHE_FILE A file to load into the cache, e.g. s3n://mybucket/sample.py#sample.py
|
312
|
-
--cache-archive CACHE_FILE A file to unpack into the cache, e.g. s3n://mybucket/sample.jar
|
313
|
-
--jobconf KEY=VALUE Specify jobconf arguments to pass to streaming, e.g. mapred.task.timeout=800000
|
314
|
-
--reducer REDUCER The reducer program or class
|
315
|
-
|
316
|
-
Job Flow Debugging Options
|
317
|
-
--enable-debugging Enable job flow debugging (you must be signed up to SimpleDB for this to work)
|
318
|
-
|
319
|
-
Adding Pig steps to job flows
|
320
|
-
--pig-script Add a step that runs a Pig script
|
321
|
-
--pig-interactive Add a step that sets up the job flow for an interactive (via SSH) pig session
|
322
|
-
|
323
|
-
Configuring a Hive on a JobFlow
|
324
|
-
--hive-site HIVE_SITE Override Hive configuration with configuration from HIVE_SITE
|
325
|
-
--hive-script Add a step that runs a Hive script
|
326
|
-
--hive-interactive Add a step that sets up the job flow for an interactive (via SSH) hive session
|
327
|
-
|
328
|
-
Adding Steps from a Json File to Job Flows
|
329
|
-
--json FILE Add a sequence of steps stored in a json file
|
330
|
-
--param VARIABLE=VALUE substitute <variable> with value in the json file
|
331
|
-
|
332
|
-
Contacting the Master Node
|
333
|
-
--no-wait Don't wait for the Master node to start before executing scp or ssh
|
334
|
-
--ssh [COMMAND] SSH to the master node and optionally run a command
|
335
|
-
--logs Display the step logs for the last executed step
|
336
|
-
--scp SRC Copy a file to the master node
|
337
|
-
--to DEST the destination to scp a file to
|
338
|
-
|
339
|
-
Settings common to all step types
|
340
|
-
--step-name STEP_NAME Set name for the step
|
341
|
-
--step-action STEP_NAME Action to take when step finishes. One of CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
|
342
|
-
--arg ARG Specify an argument to a bootstrap action, jar, streaming, pig-script or hive-script step
|
343
|
-
--args ARGS Specify a comma separated list of arguments, e.g. --args 1,2,3 would pass three arguments
|
344
|
-
|
345
|
-
Specifying Bootstrap Actions
|
346
|
-
--bootstrap-action SCRIPT Run a bootstrap action script on all instances
|
347
|
-
--bootstrap-name NAME Set the name of the bootstrap action
|
348
|
-
Note --arg and --args are used to pass arguments to bootstrap actions
|
349
|
-
|
350
|
-
Listing and Describing Job Flows
|
351
|
-
--list List all job flows created in the last 2 days
|
352
|
-
--describe Dump a JSON description of the supplied job flows
|
353
|
-
--active List running, starting or shutting down job flows
|
354
|
-
--all List all job flows in the last 2 months
|
355
|
-
--nosteps Do not list steps when listing jobs
|
356
|
-
--state STATE List job flows in STATE
|
357
|
-
-n, --max-results MAX_RESULTS Maximum number of results to list
|
358
|
-
|
359
|
-
Terminating Job Flows
|
360
|
-
--terminate Terminate the job flow
|
361
|
-
|
362
|
-
Common Options
|
363
|
-
-j, --jobflow JOB_FLOW_ID
|
364
|
-
--job-flow-id
|
365
|
-
-c, --credentials CRED_FILE File containing access-id and private-key
|
366
|
-
-a, --access-id ACCESS-ID AWS Access Id
|
367
|
-
-k, --private-key PRIVATE-KEY AWS Private Key
|
368
|
-
-v, --verbose Turn on verbose logging of program interaction
|
369
|
-
|
370
|
-
Uncommon Options
|
371
|
-
--debug Print stack traces when exceptions occur
|
372
|
-
--endpoint ENDPOINT Specify the webservice endpoint to talk to
|
373
|
-
--region REGION The region to use for the endpoint
|
374
|
-
--apps-path APPS_PATH Specify s3:// path to the base of the emr public bucket to use. e.g s3://us-east-1.elasticmapreduce
|
375
|
-
--beta-path BETA_PATH Specify s3:// path to the base of the emr public bucket to use for beta apps. e.g s3://beta.elasticmapreduce
|
376
|
-
--version Print a version string
|
377
|
-
-h, --help Show help message
|