wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
data/lib/away/runner/execute.rb
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
|
-
#
|
4
|
-
# Taken from the [Aruba project](). Original license:
|
5
|
-
#
|
6
|
-
# Copyright (c) 2010 Aslak Hellesøy, David Chelimsky
|
7
|
-
#
|
8
|
-
# Permission is hereby granted, free of charge, to any person obtaining
|
9
|
-
# a copy of this software and associated documentation files (the
|
10
|
-
# "Software"), to deal in the Software without restriction, including
|
11
|
-
# without limitation the rights to use, copy, modify, merge, publish,
|
12
|
-
# distribute, sublicense, and/or sell copies of the Software, and to
|
13
|
-
# permit persons to whom the Software is furnished to do so, subject to
|
14
|
-
# the following conditions:
|
15
|
-
#
|
16
|
-
# The above copyright notice and this permission notice shall be
|
17
|
-
# included in all copies or substantial portions of the Software.
|
18
|
-
#
|
19
|
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
20
|
-
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
21
|
-
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
22
|
-
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
23
|
-
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
24
|
-
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
25
|
-
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
26
|
-
|
27
|
-
|
28
|
-
require 'childprocess'
|
29
|
-
require 'tempfile'
|
30
|
-
|
31
|
-
module Wukong
|
32
|
-
class Runner
|
33
|
-
|
34
|
-
class Process
|
35
|
-
attr_reader :keep_ansi
|
36
|
-
|
37
|
-
def initialize(cmd, exit_timeout=2.0, io_wait=2.0, keep_ansi=true)
|
38
|
-
@exit_timeout = exit_timeout
|
39
|
-
@io_wait = io_wait
|
40
|
-
@keep_ansi = true
|
41
|
-
|
42
|
-
@process = ChildProcess.build(*cmd)
|
43
|
-
@process.io.stdout = raw_out_io
|
44
|
-
@process.io.stderr = raw_err_io
|
45
|
-
@process.duplex = true
|
46
|
-
end
|
47
|
-
|
48
|
-
def raw_out_io
|
49
|
-
@raw_out_io ||= StringIO.new('', 'w')
|
50
|
-
end
|
51
|
-
|
52
|
-
def raw_err_io
|
53
|
-
@raw_err_io ||= StringIO.new('', 'w')
|
54
|
-
end
|
55
|
-
|
56
|
-
def run!(&block)
|
57
|
-
@process.start
|
58
|
-
yield self if block_given?
|
59
|
-
end
|
60
|
-
|
61
|
-
def stdin
|
62
|
-
wait_for_io do
|
63
|
-
@process.io.stdin.sync = true
|
64
|
-
@process.io.stdin
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
def output
|
69
|
-
stdout + stderr
|
70
|
-
end
|
71
|
-
|
72
|
-
def stdout
|
73
|
-
wait_for_io do
|
74
|
-
@raw_out_io.rewind
|
75
|
-
filter_ansi(@raw_out_io.read)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def stderr
|
80
|
-
wait_for_io do
|
81
|
-
@raw_err_io.rewind
|
82
|
-
filter_ansi(@raw_err_io.read)
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
def stop(reader)
|
87
|
-
return unless @process
|
88
|
-
unless @process.exited?
|
89
|
-
reader.stdout stdout
|
90
|
-
reader.stderr stderr
|
91
|
-
@process.poll_for_exit(@exit_timeout)
|
92
|
-
end
|
93
|
-
@process.exit_code
|
94
|
-
end
|
95
|
-
|
96
|
-
def terminate
|
97
|
-
if @process
|
98
|
-
flush
|
99
|
-
@process.stop
|
100
|
-
flush
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
|
-
def flush
|
105
|
-
stdout && stderr # flush output
|
106
|
-
end
|
107
|
-
|
108
|
-
private
|
109
|
-
|
110
|
-
def wait_for_io(&block)
|
111
|
-
sleep @io_wait if @process.alive?
|
112
|
-
yield
|
113
|
-
end
|
114
|
-
|
115
|
-
def filter_ansi(string)
|
116
|
-
keep_ansi ? string : string.gsub(/\e\[\d+(?>(;\d+)*)m/, '')
|
117
|
-
end
|
118
|
-
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
data/lib/away/script.rb
DELETED
@@ -1,161 +0,0 @@
|
|
1
|
-
require 'wukong'
|
2
|
-
require 'wukong/script/hadoop_command'
|
3
|
-
require 'wukong/experimental'
|
4
|
-
|
5
|
-
#
|
6
|
-
# Runner settings
|
7
|
-
#
|
8
|
-
|
9
|
-
Settings.define :mode, :type => Symbol, :default => :mapper, :env_var => 'WUKONG_MODE', :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud; 'mapper' or 'reducer' to run that phase.", :wukong => true
|
10
|
-
Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
|
11
|
-
Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
|
12
|
-
Settings.define :script_file, :type => :filename, :description => "script file to execute, or give as first arg", :wukong => true
|
13
|
-
|
14
|
-
module Wukong
|
15
|
-
# adds ability to execute
|
16
|
-
extend Wukong::Mixin::FromFile
|
17
|
-
|
18
|
-
def self.from_file(filename)
|
19
|
-
filename = filename.to_s
|
20
|
-
filename += ".rb" if filename !~ /\.rb$/
|
21
|
-
super(filename)
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.run(filename=nil)
|
25
|
-
if filename
|
26
|
-
self.from_file(filename)
|
27
|
-
else
|
28
|
-
Settings.resolve!
|
29
|
-
end
|
30
|
-
if @main_run then return false ; end
|
31
|
-
Wukong::Script.new(Settings).run
|
32
|
-
@main_run = true
|
33
|
-
end
|
34
|
-
|
35
|
-
#
|
36
|
-
# sources a script file,
|
37
|
-
#
|
38
|
-
class Script
|
39
|
-
attr_reader :settings # configliere hash of settings
|
40
|
-
attr_reader :script_file # File to execute
|
41
|
-
attr_reader :input_paths
|
42
|
-
attr_reader :output_path
|
43
|
-
|
44
|
-
include Wukong::Script::HadoopCommand
|
45
|
-
|
46
|
-
def initialize(settings)
|
47
|
-
@settings = settings
|
48
|
-
|
49
|
-
@output_path = settings.rest.pop
|
50
|
-
@input_paths = settings.rest.reject(&:blank?)
|
51
|
-
end
|
52
|
-
|
53
|
-
|
54
|
-
# Execute the script file in the context of the Wukong module
|
55
|
-
def run_flow
|
56
|
-
Log.debug( "Running #{script_file} with settings #{settings}")
|
57
|
-
script_file = settings.script_file
|
58
|
-
mode = settings.mode
|
59
|
-
Wukong.flow(mode).run
|
60
|
-
end
|
61
|
-
|
62
|
-
#
|
63
|
-
# In --run mode, use the framework (local, hadoop, emr, etc) to re-launch
|
64
|
-
# the script as mapper, reducer, etc.
|
65
|
-
# If --map or --reduce, dispatch to the mapper or reducer.
|
66
|
-
#
|
67
|
-
def run
|
68
|
-
case settings.mode
|
69
|
-
when :local then execute_local_workflow
|
70
|
-
when :hadoop, :mapred then execute_hadoop_workflow
|
71
|
-
else
|
72
|
-
run_flow
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
#
|
77
|
-
# Shell command for map phase. By default, calls the script in --map mode
|
78
|
-
# In hadoop mode, this is given to the hadoop streaming command.
|
79
|
-
# In local mode, it's given to the system() call
|
80
|
-
#
|
81
|
-
def mapper_commandline
|
82
|
-
"#{ruby_interpreter_path} #{this_script_filename} --mode=mapper " + non_wukong_params
|
83
|
-
end
|
84
|
-
|
85
|
-
#
|
86
|
-
# Shell command for reduce phase. By default, calls the script in --reduce mode
|
87
|
-
# In hadoop mode, this is given to the hadoop streaming command.
|
88
|
-
# In local mode, it's given to the system() call
|
89
|
-
#
|
90
|
-
def reducer_commandline
|
91
|
-
"#{ruby_interpreter_path} #{this_script_filename} --mode=reducer " + non_wukong_params
|
92
|
-
end
|
93
|
-
|
94
|
-
def job_name
|
95
|
-
settings[:job_name] ||
|
96
|
-
"#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
|
97
|
-
end
|
98
|
-
|
99
|
-
# Wrapper for dangerous operations to catch errors
|
100
|
-
def safely action, &block
|
101
|
-
begin
|
102
|
-
block.call
|
103
|
-
rescue StandardError => e ; handle_error(action, e); end
|
104
|
-
end
|
105
|
-
|
106
|
-
protected
|
107
|
-
|
108
|
-
#
|
109
|
-
# Execute the runner phase:
|
110
|
-
# use the running framework to relaunch the script in map and in reduce mode
|
111
|
-
#
|
112
|
-
def execute_command! *args
|
113
|
-
command = args.flatten.reject(&:blank?).join(" \\\n ")
|
114
|
-
Log.info "Running\n\n#{command}\n"
|
115
|
-
if settings[:dry_run]
|
116
|
-
Log.info '== [Not running preceding command: dry run] =='
|
117
|
-
else
|
118
|
-
maybe_overwrite_output_paths! output_path
|
119
|
-
$stdout.puts `#{command}`
|
120
|
-
raise "Streaming command failed!" unless $?.success?
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
#
|
125
|
-
# In hadoop mode only, removes the destination path before launching.
# Does nothing unless settings.rm is set and settings.mode is :hadoop.
#
# To the panic-stricken: look in .Trash/current/path/to/accidentally_deleted_files
#
# output_path - the path handed to `hadoop fs -rmr`.
def maybe_overwrite_output_paths! output_path
  return unless settings.rm && (settings.mode == :hadoop)

  removal_command = "#{hadoop_runner} fs -rmr '#{output_path}'"
  Log.info "Removing output file #{output_path}: #{removal_command}"
  puts %x(#{removal_command})
end
|
136
|
-
|
137
|
-
# Reassemble all the non-internal-to-wukong settings into a command line for
# the map/reducer phase scripts.
#
# Settings whose definition carries a truthy :wukong attribute are dropped,
# as is any setting whose name matches /catalog_root/. What remains is
# rendered as space-separated "--name=value" flags (values are not quoted).
def non_wukong_params
  user_settings = settings.reject { |param, _val| settings.definition_of(param, :wukong) }
  user_settings = user_settings.reject { |param, _val| param.to_s =~ /catalog_root/ }
  flags = user_settings.map { |param, val| "--#{param}=#{val}" }
  flags.join(" ")
end
|
146
|
-
|
147
|
-
# The full, real path to the script file: the program name ($0) resolved
# through Pathname#realpath. Returns a Pathname.
def this_script_filename
  Pathname.new($PROGRAM_NAME).realpath
end
|
151
|
-
|
152
|
-
# Use the full ruby interpreter path to run child processes.
#
# Builds <bindir>/<RUBY_INSTALL_NAME><EXEEXT> from the interpreter's build
# configuration and resolves it with Pathname#realpath. Returns a Pathname.
#
# Reads RbConfig::CONFIG rather than the legacy top-level Config::CONFIG:
# the Config alias was deprecated in Ruby 1.9.3 and later removed, while
# RbConfig is available on old and new interpreters alike.
def ruby_interpreter_path
  require 'rbconfig'
  build_config = RbConfig::CONFIG
  executable   = build_config["RUBY_INSTALL_NAME"] + build_config["EXEEXT"]
  Pathname.new(File.join(build_config["bindir"], executable)).realpath
end
|
158
|
-
|
159
|
-
|
160
|
-
end
|
161
|
-
end
|
@@ -1,240 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
module Wukong
|
3
|
-
class Script
|
4
|
-
module HadoopCommand
|
5
|
-
|
6
|
-
# ===========================================================================
#
# Hadoop Options
#
# Every setting below is tagged :wukong => true; non_wukong_params uses that
# attribute to keep these settings out of the command line handed to the
# mapper/reducer scripts.
#
Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :description => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME', :wukong => true
Settings.define :hadoop_runner, :description => "Path to hadoop script. Usually set --hadoop_home instead of this.", :wukong => true

#
# Translate simplified args to their hairy hadoop equivalents
#
# For the :jobconf => true settings, the :description is not prose: it is the
# hadoop property name that the jobconf method looks up (via
# settings.definition_of(option, :description)) to emit "-D <property>=<value>".
#
Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
Settings.define :max_tracker_failures, :jobconf => true, :description => 'mapred.max.tracker.failures', :wukong => true
Settings.define :max_map_attempts, :jobconf => true, :description => 'mapred.map.max.attempts', :wukong => true
Settings.define :max_reduce_attempts, :jobconf => true, :description => 'mapred.reduce.max.attempts', :wukong => true
Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
# The two settings below are not jobconf properties; hadoop_other_args turns
# them into -lazyOutput and -inputreader arguments respectively.
Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
Settings.define :split_on_xml_tag, :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
|
42
|
-
|
43
|
-
# Emit a hadoop "-D property=value" option if the simplified command line arg
# is present; if not, the resulting nil will be elided later.
#
# option - the settings key to look up. The hadoop property name is taken
#          from the :description of that setting's definition.
#
# (An earlier form of this option was "-jobconf property=value".)
def jobconf option
  value = settings[option]
  return unless value

  hadoop_property = settings.definition_of(option, :description)
  format("-D %s=%s", hadoop_property, value)
end
|
51
|
-
|
52
|
-
#
|
53
|
-
# Assemble the hadoop command to execute
# and launch the hadoop runner to execute the script across all tasktrackers
#
# The streaming jar is located under settings[:hadoop_home], so set
# Settings[:hadoop_home] to point at your hadoop install.
#
# FIXME: Should add some simple logic to ensure that commands are in the
# right order or hadoop will complain. ie. -D settings MUST come before
# others
#
def execute_hadoop_workflow
  # Input paths are handed to hadoop as one comma-separated list
  input_paths = @input_paths.join(',')

  # Fragments are gathered in the order hadoop should see them; nested arrays
  # and nils are tolerated here and squeezed out just before joining.
  fragments = []
  fragments << hadoop_runner
  fragments << "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar"
  fragments << hadoop_jobconf_settings
  fragments << "-D mapred.job.name='#{job_name}'"
  fragments << hadoop_other_args
  fragments << "-mapper '#{mapper_commandline}'"
  fragments << "-reducer '#{reducer_commandline}'"
  fragments << "-input '#{input_paths}'"
  fragments << "-output '#{output_path}'"
  fragments << "-file '#{this_script_filename}'"
  fragments << hadoop_recycle_env

  hadoop_commandline = fragments.flatten.compact.join(" \t\\\n ")
  Log.info " Launching hadoop!"
  execute_command!(hadoop_commandline)
end
|
81
|
-
|
82
|
-
# Collect the "-D property=value" options for every simplified hadoop setting
# that has a value.
#
# Side effects on settings before the options are rendered:
# * settings[:reuse_jvms] of literal true is rewritten to '-1'
# * settings[:respect_exit_status] is set to 'false' when
#   settings[:ignore_exit_status] is literal true
#
# Returns a flat Array of option Strings with unset options dropped. Each
# option appears at most once: the partitioning options are deliberately
# placed first when partition_fields is set, and the general list below names
# them again, so the result is de-duplicated (keeping the first occurrence).
def hadoop_jobconf_settings
  jobconf_settings = []
  # Fixup these settings
  settings[:reuse_jvms] = '-1' if (settings[:reuse_jvms] == true)
  settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
  # If no reducer and no reduce_command, then skip the reduce phase
  # FIXME: auto-detect nil reducer
  # settings[:reduce_tasks] = 0 if (! reducer) && (! settings[:reduce_command]) && (! settings[:reduce_tasks])
  # Fields hadoop should use to distribute records to reducers
  unless settings[:partition_fields].blank?
    jobconf_settings += [
      jobconf(:partition_fields),
      jobconf(:output_field_separator),
    ]
  end
  jobconf_settings += [
    :io_sort_mb, :io_sort_record_percent,
    :map_speculative, :map_tasks,
    :max_maps_per_cluster, :max_maps_per_node,
    :max_node_map_tasks, :max_node_reduce_tasks,
    :max_reduces_per_cluster, :max_reduces_per_node,
    :max_record_length, :min_split_size,
    :output_field_separator, :key_field_separator,
    :partition_fields, :sort_fields,
    :reduce_tasks, :respect_exit_status,
    :reuse_jvms, :timeout,
    :max_tracker_failures, :max_map_attempts,
    :max_reduce_attempts
  ].map{|opt| jobconf(opt)}
  # uniq keeps the first occurrence, so the early partitioning options stay
  # in front while their repeats from the general list are dropped.
  jobconf_settings.flatten.compact.uniq
end
|
113
|
-
|
114
|
-
# Non-jobconf arguments for the hadoop streaming command.
#
# Starts from settings[:extra_args] (which may be nil) and appends:
# * an -inputreader clause when settings.split_on_xml_tag is set, so that
#   records are delimited by <tag> ... </tag>
# * -lazyOutput when settings[:noempty] is set (don't create reduce file if
#   no records)
# * the KeyFieldBasedPartitioner when settings[:partition_fields] is present
#
# Returns an Array; the caller is expected to drop any nil entry.
def hadoop_other_args
  other_args = [settings[:extra_args]]

  if settings.split_on_xml_tag
    other_args.push("-inputreader 'StreamXmlRecordReader,begin=<#{settings.split_on_xml_tag}>,end=</#{settings.split_on_xml_tag}>'")
  end
  if settings[:noempty]
    other_args.push(' -lazyOutput')
  end
  if !settings[:partition_fields].blank?
    other_args.push(' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner')
  end

  other_args
end
|
123
|
-
|
124
|
-
# Environment variables to forward to the streaming tasks.
#
# For each variable in the forward list (currently just RUBYLIB) that is set
# in this process's environment, yields one "-cmdenv 'NAME=value'" argument.
# Returns an Array of Strings, empty when nothing is set.
def hadoop_recycle_env
  present = ['RUBYLIB'].select { |name| ENV[name] }
  present.map { |name| "-cmdenv '#{name}=#{ENV[name]}'" }
end
|
129
|
-
|
130
|
-
# The path to the hadoop runner script: settings[:hadoop_runner] when given,
# otherwise bin/hadoop underneath settings[:hadoop_home].
def hadoop_runner
  explicit_runner = settings[:hadoop_runner]
  return explicit_runner if explicit_runner

  settings[:hadoop_home] + '/bin/hadoop'
end
|
134
|
-
|
135
|
-
module ClassMethods
  #
  # Accessors for job parameters that hadoop streaming exposes to the task
  # through environment variables. Each returns the raw String from ENV, or
  # nil when the variable is not set.
  #
  # Via @pskomoroch via @tlipcon,
  #
  # "there is a little known Hadoop Streaming trick buried in this Python
  # script. You will notice that the date is not actually in the raw log
  # data itself, but is part of the filename. It turns out that Hadoop makes
  # job parameters you would fetch in Java with something like
  # job.get("mapred.input.file") available as environment variables for
  # streaming jobs, with periods replaced with underscores:
  #
  # filepath = os.environ["map_input_file"]
  # filename = os.path.split(filepath)[-1]
  # Thanks to Todd Lipcon for directing me to that hack.
  #

  # HDFS pathname to the input file currently being processed.
  def input_file
    ENV.fetch('map_input_file', nil)
  end

  # Directory of the input file
  def input_dir
    ENV.fetch('mapred_input_dir', nil)
  end

  # Offset of this chunk within the input file
  def map_input_start_offset
    ENV.fetch('map_input_start', nil)
  end

  # length of the mapper's input chunk
  def map_input_length
    ENV.fetch('map_input_length', nil)
  end

  # Value of the mapred_task_id environment variable
  def attempt_id
    ENV.fetch('mapred_task_id', nil)
  end

  # Value of the mapred_tip_id environment variable
  def curr_task_id
    ENV.fetch('mapred_tip_id', nil)
  end

  # Value of the stream_map_streamprocessor environment variable
  # (presumably the URL-encoded mapper command line, going by the name)
  def script_cmdline_urlenc
    ENV.fetch('stream_map_streamprocessor', nil)
  end
end
|
182
|
-
|
183
|
-
# Standard ClassMethods-on-include trick: whenever this module is included,
# the including class/module is also extended with ClassMethods, so those
# methods become callable on the class itself.
def self.included base
  base.extend(ClassMethods)
end
|
189
|
-
end
|
190
|
-
end
|
191
|
-
end
|
192
|
-
|
193
|
-
# -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
|
194
|
-
# -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
|
195
|
-
# -D mapred.text.key.comparator.options=-k2,2nr\
|
196
|
-
# -D mapred.text.key.partitioner.options=-k1,2\
|
197
|
-
# -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
|
198
|
-
# -D stream.num.map.output.key.fields=\"$sortfields\"
|
199
|
-
#
|
200
|
-
# -D stream.map.output.field.separator=\"'/t'\"
|
201
|
-
# -D map.output.key.field.separator=. \
|
202
|
-
# -D mapred.data.field.separator=. \
|
203
|
-
# -D map.output.key.value.fields.spec=6,5,1-3:0- \
|
204
|
-
# -D reduce.output.key.value.fields.spec=0-2:5- \
|
205
|
-
|
206
|
-
# "HADOOP_HOME" =>"/usr/lib/hadoop-0.20/bin/..",
|
207
|
-
# "HADOOP_IDENT_STRING" =>"hadoop",
|
208
|
-
# "HADOOP_LOGFILE" =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
|
209
|
-
# "HADOOP_LOG_DIR" =>"/usr/lib/hadoop-0.20/bin/../logs",
|
210
|
-
# "HOME" =>"/var/run/hadoop-0.20",
|
211
|
-
# "JAVA_HOME" =>"/usr/lib/jvm/java-6-sun",
|
212
|
-
# "LD_LIBRARY_PATH" =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
|
213
|
-
# "PATH" =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
|
214
|
-
# "USER" =>"hadoop",
|
215
|
-
#
|
216
|
-
# "dfs_block_size" =>"134217728",
|
217
|
-
# "map_input_start" =>"0",
|
218
|
-
# "map_input_length" =>"125726898",
|
219
|
-
# "mapred_output_key_class" =>"org.apache.hadoop.io.Text",
|
220
|
-
# "mapred_output_value_class" =>"org.apache.hadoop.io.Text",
|
221
|
-
# "mapred_output_format_class" =>"org.apache.hadoop.mapred.TextOutputFormat",
|
222
|
-
# "mapred_output_compression_codec" =>"org.apache.hadoop.io.compress.DefaultCodec",
|
223
|
-
# "mapred_output_compression_type" =>"BLOCK",
|
224
|
-
# "mapred_task_partition" =>"0",
|
225
|
-
# "mapred_tasktracker_map_tasks_maximum" =>"4",
|
226
|
-
# "mapred_tasktracker_reduce_tasks_maximum" =>"2",
|
227
|
-
# "mapred_tip_id" =>"task_200910221152_0023_m_000000",
|
228
|
-
# "mapred_task_id" =>"attempt_200910221152_0023_m_000000_0",
|
229
|
-
# "mapred_job_tracker" =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
|
230
|
-
#
|
231
|
-
# "mapred_input_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
|
232
|
-
# "map_input_file" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
|
233
|
-
# "mapred_working_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
|
234
|
-
# "mapred_work_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
|
235
|
-
# "mapred_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
|
236
|
-
# "mapred_temp_dir" =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
|
237
|
-
# "PWD" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
|
238
|
-
# "TMPDIR" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
|
239
|
-
# "stream_map_streamprocessor" =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
|
240
|
-
# "user_name" =>"flip",
|