wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
data/lib/away/escapement.rb
DELETED
@@ -1,129 +0,0 @@
|
|
1
|
-
require 'htmlentities'
|
2
|
-
require 'addressable/uri'
|
3
|
-
|
4
|
-
# Fix a bug (?) in the HTMLEntities encoder class with $KCODE='NONE'
|
5
|
-
HTMLEntities::Encoder.class_eval do
|
6
|
-
private
|
7
|
-
def extended_entity_regexp
|
8
|
-
@extended_entity_regexp ||= (
|
9
|
-
if encoding_aware?
|
10
|
-
regexp = '[^\u{20}-\u{7E}]'
|
11
|
-
else
|
12
|
-
# regexp = '[^\x20-\x7E]'
|
13
|
-
regexp = '[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+'
|
14
|
-
end
|
15
|
-
regexp += "|'" if @flavor == 'html4'
|
16
|
-
Regexp.new(regexp)
|
17
|
-
)
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
module Wukong
|
22
|
-
#
|
23
|
-
# By default (or explicitly with the :xml strategy), convert string to
|
24
|
-
# * XML-encoded ASCII,
|
25
|
-
#
|
26
|
-
# * with a guarantee that the characters " quote, ' apos \\ backslash,
|
27
|
-
# carriage-return \r newline \n and tab \t (as well as all other control
|
28
|
-
# characters) are encoded.
|
29
|
-
#
|
30
|
-
# * Any XML-encoding in the original text is encoded with no introspection:
|
31
|
-
# encode_str("&lt;a href=\"foo\"&gt;")
|
32
|
-
# # => "&amp;lt;a href=&quot;foo&quot;&amp;gt;"
|
33
|
-
#
|
34
|
-
# * Useful: http://rishida.net/scripts/uniview/conversion.php
|
35
|
-
#
|
36
|
-
# With the :url strategy,
|
37
|
-
# * URL-encode the string
|
38
|
-
# * This is as strict as possible: encodes all but alphanumeric and _ underscore.
|
39
|
-
# The resulting string is thus XML- and URL-safe.
|
40
|
-
# http://addressable.rubyforge.org/api/classes/Addressable/URI.html#M000010
|
41
|
-
#
|
42
|
-
# Wukong.decode_str(Wukong.encode_str(str)) returns the original str
|
43
|
-
#
|
44
|
-
# If you're seeing bad_encoding errors, try
|
45
|
-
# $KCODE='u' unless "1.9".respond_to?(:encoding)
|
46
|
-
# at the start of your script.
|
47
|
-
#
|
48
|
-
def self.encode_str str, strategy=:xml
|
49
|
-
begin
|
50
|
-
case strategy
|
51
|
-
when :xml then self.html_encoder.encode(str, :basic, :named, :decimal).gsub(/\\/, '\')
|
52
|
-
when :url then Addressable::URI.encode_component(str, /[^\w]/)
|
53
|
-
else raise "Don't know how to encode with strategy #{strategy}"
|
54
|
-
end
|
55
|
-
rescue ArgumentError => e
|
56
|
-
'!bad_encoding!! ' + str.gsub(/[^\w\s\.\-@#%]+/, '')
|
57
|
-
end
|
58
|
-
end
|
59
|
-
# HTMLEntities encoder instance
|
60
|
-
def self.html_encoder
|
61
|
-
@html_encoder ||= HTMLEntities.new
|
62
|
-
end
|
63
|
-
|
64
|
-
#
|
65
|
-
# Decode string from its encode_str representation. This can include
|
66
|
-
# dangerous things such as tabs, newlines, backslashes and cryptofascist
|
67
|
-
# propaganda.
|
68
|
-
#
|
69
|
-
def self.decode_str str, strategy=:xml
|
70
|
-
case strategy
|
71
|
-
when :xml then self.html_encoder.decode(str)
|
72
|
-
when :url then Addressable::URI.unencode_component(str)
|
73
|
-
else raise "Don't know how to decode with strategy #{strategy}"
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
#
|
78
|
-
# Replace each given field in the hash with its
|
79
|
-
# encoded value
|
80
|
-
#
|
81
|
-
def self.encode_components hsh, *fields
|
82
|
-
fields.each do |field|
|
83
|
-
hsh[field] = hsh[field].to_s.wukong_encode if hsh[field]
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
String.class_eval do
|
89
|
-
|
90
|
-
#
|
91
|
-
# Strip control characters that might harsh our buzz, TSV-wise
|
92
|
-
# See Wukong.encode_str
|
93
|
-
#
|
94
|
-
def wukong_encode! *args
|
95
|
-
replace self.wukong_encode(*args)
|
96
|
-
end
|
97
|
-
|
98
|
-
def wukong_encode(*args)
|
99
|
-
Wukong.encode_str(self, *args)
|
100
|
-
end
|
101
|
-
|
102
|
-
#
|
103
|
-
# Decode string into original (and possibly unsafe) form
|
104
|
-
# See Wukong.encode_str and Wukong.decode_str
|
105
|
-
#
|
106
|
-
def wukong_decode!(*args)
|
107
|
-
replace self.wukong_decode(*args)
|
108
|
-
end
|
109
|
-
|
110
|
-
def wukong_decode(*args)
|
111
|
-
Wukong.decode_str(self, *args)
|
112
|
-
end
|
113
|
-
|
114
|
-
#
|
115
|
-
# Takes an XML-encoded or plaintext string and forces it into canonical encoding
|
116
|
-
#
|
117
|
-
def wukong_recode!(*args)
|
118
|
-
replace self.wukong_decode(*args).wukong_encode(*args)
|
119
|
-
end
|
120
|
-
def wukong_recode
|
121
|
-
Wukong.encode_str(Wukong.decode_str(self, *args), *args)
|
122
|
-
end
|
123
|
-
end
|
124
|
-
|
125
|
-
Struct.class_eval do
|
126
|
-
def recode!(*args)
|
127
|
-
each_pair{|k,v| v.wukong_recode!(*args) if (v && v.respond_to?(:wukong_recode!)) }
|
128
|
-
end
|
129
|
-
end
|
data/lib/away/exe.rb
DELETED
data/lib/away/experimental.rb
DELETED
data/lib/away/from_file.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Borrowed from Opscode Chef -- thanks guys
|
3
|
-
#
|
4
|
-
# Author:: Adam Jacob (<adam@opscode.com>)
|
5
|
-
# Author:: Christopher Walters (<cw@opscode.com>)
|
6
|
-
# Copyright:: Copyright (c) 2008 Opscode, Inc.
|
7
|
-
# License:: Apache License, Version 2.0
|
8
|
-
#
|
9
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
10
|
-
# you may not use this file except in compliance with the License.
|
11
|
-
# You may obtain a copy of the License at
|
12
|
-
#
|
13
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
14
|
-
#
|
15
|
-
# Unless required by applicable law or agreed to in writing, software
|
16
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
17
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18
|
-
# See the License for the specific language governing permissions and
|
19
|
-
# limitations under the License.
|
20
|
-
#
|
21
|
-
|
22
|
-
module Wukong
|
23
|
-
module Mixin
|
24
|
-
module FromFile
|
25
|
-
|
26
|
-
# Loads a given ruby file, and runs instance_eval against it in the context of the current
|
27
|
-
# object.
|
28
|
-
#
|
29
|
-
# Raises an IOError if the file cannot be found, or is not readable.
|
30
|
-
def from_file(filename)
|
31
|
-
if File.exists?(filename) && File.readable?(filename)
|
32
|
-
self.instance_eval(IO.read(filename), filename, 1)
|
33
|
-
else
|
34
|
-
raise IOError, "Cannot open or read #{filename}!"
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
# Loads a given ruby file, and runs class_eval against it in the context of the current
|
39
|
-
# object.
|
40
|
-
#
|
41
|
-
# Raises an IOError if the file cannot be found, or is not readable.
|
42
|
-
def class_from_file(filename)
|
43
|
-
if File.exists?(filename) && File.readable?(filename)
|
44
|
-
self.class_eval(IO.read(filename), filename, 1)
|
45
|
-
else
|
46
|
-
raise IOError, "Cannot open or read #{filename}!"
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
data/lib/away/job.rb
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
#
|
3
|
-
#
|
4
|
-
#
|
5
|
-
#
|
6
|
-
class Job < Wukong::Graph
|
7
|
-
# invokable resources
|
8
|
-
attr_reader :resources
|
9
|
-
|
10
|
-
def to_s
|
11
|
-
['<job', handle,
|
12
|
-
"resources={#{resources.join(' | ')}}",
|
13
|
-
"chain={#{chain.join(' | ')}}",
|
14
|
-
].join(' ')+'>'
|
15
|
-
end
|
16
|
-
|
17
|
-
|
18
|
-
def add_resource(type, handle=nil, *args, &block)
|
19
|
-
rsrc = Wukong.create(type, handle, *args, &block)
|
20
|
-
rsrc.graph = self
|
21
|
-
@resources << rsrc
|
22
|
-
rsrc
|
23
|
-
end
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
module Task
|
28
|
-
extend Gorillib::Concern
|
29
|
-
include Wukong::Stage
|
30
|
-
|
31
|
-
module ClassMethods
|
32
|
-
def define_action(name, options={}, &block)
|
33
|
-
self.actions = self.actions.merge(name => options.merge(:block => block))
|
34
|
-
end
|
35
|
-
|
36
|
-
def class_defaults
|
37
|
-
super
|
38
|
-
# field :actions, Array, :of => Symbol, :description => 'list of actions this stage responds to'
|
39
|
-
class_attribute :actions
|
40
|
-
self.actions ||= Hash.new
|
41
|
-
class_attribute :default_action
|
42
|
-
|
43
|
-
define_action :nothing, :description => 'ze goggles, zey do nussing'
|
44
|
-
end
|
45
|
-
|
46
|
-
end
|
47
|
-
included do
|
48
|
-
self.class_defaults
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def self.job(handle, *args, &block)
|
53
|
-
@jobs ||= Hash.new
|
54
|
-
@jobs[handle] ||= Job.new(handle, *args, &block)
|
55
|
-
end
|
56
|
-
end
|
data/lib/away/job/rake_compat.rb
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
module Wukong

  #
  # Placeholder for letting `Wukong::Job`s invoke and depend on `Rake::Task`s.
  # The module currently defines nothing.
  #
  # For example, Rails defines the `:environment` task, which a job could
  # depend on:
  #
  #     task :email_expiring, :depends => :environment do
  #       desc "Email expiring accounts to let them know"
  #       date = ENV['from'] ? Date.parse(ENV['from']) : Date.today
  #       Account.notify_expiring(date)
  #     end
  #
  module RakeCompat
  end
end
|
data/lib/away/registry.rb
DELETED
@@ -1,79 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
# Registries declared so far, keyed by type symbol.
@@registries ||= Hash.new

# Declares a registry for +type+ and defines its accessors on Wukong itself.
#
# For `registry(:source)` the generated module-level methods are:
#
#   sources                    -- the registry (name overridable with :plural)
#   source_klass(handle)       -- registry.find(handle)
#   register_source(klass)     -- registry.register(klass)
#   unregister_source(handle)  -- registry.unregister(handle)
#   source_exists?(handle)     -- registry.exists?(handle)
#
# Does nothing if a registry for +type+ already exists.
#
# @param type    [#to_sym] registry name
# @param options [Hash] :plural overrides the name of the registry accessor
def self.registry(type, options={})
  type        = type.to_sym
  plural_name = options[:plural] || "#{type}s"
  return if @@registries[type]

  @@registries[type] = Registry.new(type)

  define_singleton_method(plural_name) do
    @@registries[type]
  end
  define_singleton_method("#{type}_klass") do |handle|
    @@registries[type].find(handle)
  end
  define_singleton_method("register_#{type}") do |klass|
    @@registries[type].register(klass)
  end
  define_singleton_method("unregister_#{type}") do |handle|
    @@registries[type].unregister(handle)
  end
  define_singleton_method("#{type}_exists?") do |handle|
    @@registries[type].exists?(handle)
  end
end
|
22
|
-
|
23
|
-
# Instantiates a registered class from the registry of the given type.
#
# The type is normalised with `to_sym` because `Wukong.registry` stores
# registries under symbol keys; a string type previously missed the lookup.
#
# @param type [#to_sym] registry name, as passed to `Wukong.registry`
# @param args forwarded to `Registry#create` (the handle comes first)
# @return the object built by `Registry#create`
# @raise [ArgumentError] if no registry was declared for +type+
def self.create(type, *args, &block)
  registry = @@registries[type.to_sym]
  raise ArgumentError, "no registry of type '#{type}' has been declared" unless registry
  registry.create(*args, &block)
end
|
26
|
-
|
27
|
-
# Maps handles to the classes registered for one registry type.
class Registry < Mash
  # A valid identifier starts with a letter and has only letters, numbers and underscores.
  # (`\w*`, not `\w+`: a single letter is a valid identifier too.)
  VALID_IDENTIFIER_RE = /\A[a-z]\w*\z/i

  attr_reader :type

  # @param type name of this registry; used in error messages
  def initialize(type)
    @type = type
  end

  # @return [Registry] a frozen copy of this registry
  def all
    self.dup.freeze
  end

  # given example of registry's class, return it;
  # otherwise, look up the handle and return that.
  def find(handle)
    return handle if handle.is_a?(Class)
    self[handle]
  end

  # Like #find, but raises when nothing is registered under +handle+.
  #
  # Extra arguments are accepted for compatibility and ignored: forwarding
  # them to the one-argument #find raised an unrelated arity error.
  #
  # @raise [ArgumentError] if the handle is unknown
  def find!(handle, *args)
    find(handle) || raise(ArgumentError, "cannot find #{type} named '#{handle}'")
  end

  # @return [Boolean] whether something is registered under +handle+
  def exists?(handle)
    self.has_key?(handle)
  end

  # Looks up +handle+ and instantiates the class with the remaining arguments.
  #
  # @raise [ArgumentError] if the handle is unknown
  def create(handle, *args, &block)
    find!(handle).new(*args, &block)
  end

  # add given class to registry
  def register(klass)
    self[klass.handle] = klass
  end

  # Removes an entry, given either the registered object or its bare handle.
  #
  # The generated `Wukong.unregister_<type>(handle)` wrapper passes a
  # handle, which has no `handle` method of its own.
  def unregister(klass)
    handle = klass.respond_to?(:handle) ? klass.handle : klass
    self.delete(handle)
  end

  # Lets a class stand in for its handle wherever a key is expected.
  def convert_key(key)
    key.is_a?(Class) ? key.handle : super(key)
  end

  # @return [Integer, nil] truthy when +handle+ is a valid identifier
  def self.valid_handle?(handle)
    handle.to_s =~ VALID_IDENTIFIER_RE
  end

end
|
78
|
-
|
79
|
-
end
|
data/lib/away/runner.rb
DELETED
@@ -1,276 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
# Record of one process launch: who ran what, when, and the raw output.
#
# NOTE(review): `Runner` is referenced here before `class Runner` appears
# in this file, and Runner#run also passes :input and :arg_hsh, which are
# not declared below -- confirm both are intended.
class RunnerResult
  field :runner,  Runner, :doc => 'Runner object that created this job'
  field :command, Array,  :of => String, :doc => 'launch command'
  [:beg_time, :end_time].each { |attr| field attr, Time }
  [:raw_out, :raw_err].each   { |attr| field attr, String }
end
|
10
|
-
|
11
|
-
#
# A uniform interface for launching processes.
#
# * accepts humanized and standardized args
# * synthesizes args into a command
# * launches the process
# * parses its output
#
class Runner
  require 'open3' # stdlib; Runner.launch uses it to capture stdout and stderr

  class_attribute :result_parser ; self.result_parser = RunnerResult

  field :name,          Symbol,   :required => true
  field :executor_path, Pathname, :required => true

  # Flattens an argument hash into `--long-name value` pairs.
  #
  # Each run of dashes, underscores and non-word characters in a key is
  # collapsed into +dash+.
  #
  # @param arg_hsh [Hash] option name => value
  # @param dash    [String] separator used inside option names
  # @return [Array<String>] alternating option names and values
  def to_long_params(arg_hsh, dash='-')
    arg_hsh.flat_map do |param, val|
      ["--#{param.to_s.gsub(/[\-_\W]+/, dash)}", val.to_s]
    end
  end

  # Arguments in the form the executable expects. The original called an
  # undefined `to_dashed_params`; #to_long_params is the only converter
  # defined in this class.
  #
  # @return [Array<String>]
  def native_args(arg_hsh)
    to_long_params(arg_hsh)
  end

  # @return [Array] the executable path followed by its native arguments
  def command(arg_hsh)
    [executor_path, *native_args(arg_hsh)]
  end

  # Launches the command and wraps the outcome in a `result_parser` object.
  #
  # @param input   recorded on the result; not part of the command line
  # @param arg_hsh [Hash] arguments for the command
  # @return an instance of `result_parser`
  def run(input, arg_hsh)
    cmd      = command(arg_hsh)            # #command takes the arg hash only
    beg_time = Time.now
    out, err = self.class.launch(*cmd)     # launch is defined on the class
    end_time = Time.now

    result_parser.new({
        :runner   => self,
        :command  => cmd,
        :beg_time => beg_time,
        :end_time => end_time,
        :input    => input,
        :arg_hsh  => arg_hsh,
        :raw_out  => out,
        :raw_err  => err,
      })
  end

  class << self
    # Class-level getter/setter for the executable.
    #
    # Zero arguments read the current value, one argument sets it; the
    # arity check therefore allows 0..1 rather than exactly 1.
    def executor(*args)
      ArgumentError.check_arity!(args, 0..1)
      @executor = args.first if args.present?
      @executor
    end

    # Runs a command and captures its output.
    #
    # Words are passed to the process as separate arguments rather than
    # joined into one shell string, so values containing spaces or shell
    # metacharacters are not reinterpreted.
    #
    # @param cmd words of the command; each is converted with `to_s`
    # @return [Array(String, String)] stdout and stderr, matching the
    #   `out, err = launch(...)` usage of the callers in this class
    def launch(*cmd)
      out, err, _status = Open3.capture3(*cmd.map(&:to_s))
      [out, err]
    end

    # Resolves a bare command name with the system `which`.
    #
    # @param basename command name without any directory part
    # @return [String] the chomped output of `which`
    # @raise [ArgumentError] if +basename+ contains a slash
    def which(basename)
      raise ArgumentError, "which wants a basename, not a path (#{basename})" if basename =~ %r{\/}
      out, _err = launch('which', basename)
      out.chomp
    end

  end
end
|
77
|
-
|
78
|
-
# Mixin for runners whose command takes input paths and one output path.
module RunnerWithInputOutput
  extend Gorillib::Concern
  include Hanuman::IsOwnInputSlot
  include Hanuman::IsOwnOutputSlot

  # sugar for a command that takes input to produce output.
  #
  # Passes both arguments that `Runner#run(input, arg_hsh)` declares; the
  # original passed the merged hash alone, which does not match that
  # signature.
  #
  # @param [Array<String>, String] inputs -- added as the `:inputs` arg (converting to an array if necessary)
  # @param [String] output -- added as the `:output` arg
  # @param [Hash] args -- further arguments for the command
  #
  def run(inputs, output, args={})
    inputs = Array.wrap(inputs)
    super(inputs, args.merge(:inputs => inputs, :output => output))
  end
end
|
93
|
-
|
94
|
-
#
# Wukong::Runner interface for the `cp` command
#
# Inherits from Runner: the class-level `executor` and `which` used below
# are defined there, as is the `run` that RunnerWithInputOutput#run calls
# through `super`.
#
# NOTE(review): `argument` is not defined anywhere in this file -- confirm
# where that macro comes from.
#
# @example
#   runner = Wukong::CpRunner.new
#   runner.run('my_src.jpg', 'my_dest.jpg')
#
class CpRunner < Runner
  include RunnerWithInputOutput
  executor which('cp')

  argument :verbose,   Boolean, :native => '-v', :solo => true, :doc => 'show files as they are copied'
  argument :duplicate, Boolean, :native => '-a', :solo => true, :doc => 'Preserves structure and attributes of files'
end
|
108
|
-
|
109
|
-
# Wukong::Runner interface for the `scp` command.
#
# Inherits from Runner: the class-level `executor` and `which` used below
# are defined there, as is the `run` that RunnerWithInputOutput#run calls
# through `super`.
#
# NOTE(review): `argument` and `success_exit_status=` are not defined
# anywhere in this file -- confirm where they come from.
class ScpRunner < Runner
  include RunnerWithInputOutput
  executor which('scp')

  argument :verbose,      Boolean,  :native => '-v', :solo => true, :doc => 'show files as they are copied'
  argument :duplicate,    Boolean,  :native => '-p', :solo => true, :doc => 'Preserves structure and attributes of files'
  #
  argument :ssh_user,     String
  argument :dest_host,    String
  argument :ssh_key_file, Pathname, :native => '-i'
  argument :dest_port,    Integer,  :native => '-P'

  argument :compression,  Boolean,  :native => '-C'
  argument :recursive,    Boolean,  :native => '-r'

  self.success_exit_status = 0
end
|
126
|
-
|
127
|
-
# Arguments shared by runners that launch a Java program.
#
# NOTE(review): `argument`, `path_to` and `arg_val` are not defined in this
# file -- confirm where they come from.
module RunnerForJava

  argument :java_home, :env_var => 'JAVA_HOME', :doc => 'path to the java environment; $JAVA_HOME/bin usually holds your java runner'

  # Resolved from :java_home by the :finally callback.
  argument :java_prog, :finally => lambda { path_to(arg_val(:java_home), 'bin', 'java') }

  [:jar, :classpath].each { |arg_name| argument arg_name }

  # Not implemented yet; returns nil.
  def java_conf ; end

end
|
141
|
-
|
142
|
-
# Wukong::Runner interface for launching Hadoop streaming jobs.
#
# Inherits from Runner: the class-level `executor` and `which` used below
# are defined there.
#
# NOTE(review): `argument`, `repeated_argument`, `settings`,
# `hadoop_jobconf_settings`, `mapper_commandline`, `reducer_commandline`,
# `output_path`, `this_script_filename` and `execute_command!` are not
# defined in this file -- confirm where they come from.
class HadoopRunner < Runner
  include RunnerWithInputOutput
  executor which('hadoop')

  argument :verbose, Boolean, :native => '-v', :solo => true, :doc => 'show files as they are copied'

  argument :hadoop_home, :default => '/usr/lib/hadoop', :doc => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME'
  argument :hadoop_runner, :doc => "Path to hadoop script. Usually set --hadoop_home instead of this."

  #
  # Translate simplified args to their hairy hadoop equivalents
  #
  argument :job_name,               :jobconf => 'mapred.job.name'
  #
  argument :io_sort_mb,             :jobconf => 'io.sort.mb'
  argument :io_sort_record_percent, :jobconf => 'io.sort.record.percent'
  argument :key_field_separator,    :jobconf => 'map.output.key.field.separator'
  argument :map_speculative,        :jobconf => 'mapred.map.tasks.speculative.execution'
  argument :map_tasks,              :jobconf => 'mapred.map.tasks'
  argument :max_maps_per_cluster,   :jobconf => 'mapred.max.maps.per.cluster'
  argument :max_maps_per_node,      :jobconf => 'mapred.max.maps.per.node'
  argument :max_node_map_tasks,     :jobconf => 'mapred.tasktracker.map.tasks.maximum'
  argument :max_node_reduce_tasks,  :jobconf => 'mapred.tasktracker.reduce.tasks.maximum'
  argument :max_record_length,      :jobconf => 'mapred.linerecordreader.maxlength', :doc => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
  argument :max_reduces_per_cluster,:jobconf => 'mapred.max.reduces.per.cluster'
  argument :max_reduces_per_node,   :jobconf => 'mapred.max.reduces.per.node'
  argument :max_tracker_failures,   :jobconf => 'mapred.max.tracker.failures'
  argument :max_map_attempts,       :jobconf => 'mapred.map.max.attempts'
  argument :max_reduce_attempts,    :jobconf => 'mapred.reduce.max.attempts'
  argument :min_split_size,         :jobconf => 'mapred.min.split.size'
  argument :output_field_separator, :jobconf => 'stream.map.output.field.separator'
  argument :partition_fields,       :jobconf => 'num.key.fields.for.partition'
  argument :reduce_tasks,           :jobconf => 'mapred.reduce.tasks'
  argument :respect_exit_status,    :jobconf => 'stream.non.zero.exit.is.failure'
  argument :reuse_jvms,             :jobconf => 'mapred.job.reuse.jvm.num.tasks'
  argument :sort_fields,            :jobconf => 'stream.num.map.output.key.fields'
  argument :timeout,                :jobconf => 'mapred.task.timeout'
  argument :noempty,                :doc => "don't create zero-byte reduce files (hadoop mode only)"
  argument :split_on_xml_tag,       :doc => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"


  argument :mapper_command,  String, :native => '-mapper'
  argument :reducer_command, String, :native => '-reducer'

  repeated_argument :file, String, :native => '-file'

  # emit a -D hadoop option if the simplified command line arg is present
  #
  # The hadoop key is read from the argument's :jobconf definition, which
  # is where the declarations above put it; none of them sets :description.
  # NOTE(review): assumes `settings.definition_of` sees the options passed
  # to `argument` -- confirm.
  #
  # @return [String, nil] the option string, or nil when the setting is absent
  def jobconf option
    if settings[option]
      "-D %s=%s" % [settings.definition_of(option, :jobconf), settings[option]]
    end
  end

  # Normalises settings before the command line is assembled.
  def finalize_settings
    settings[:reuse_jvms]          = '-1'    if (settings[:reuse_jvms]          == true)
    settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status]  == true)
    # The argument declared above is :reducer_command; :reduce_command is
    # still honored so an existing setting under that name keeps working.
    has_reducer = settings[:reducer_command] || settings[:reduce_command]
    settings[:reduce_tasks]        = 0       unless has_reducer
  end

  # Extra streaming options derived from settings.
  #
  # @return [Array] may contain nil (no :extra_args); the caller compacts it
  def hadoop_other_args
    extra_str_args  = [ settings[:extra_args] ]
    if settings.split_on_xml_tag
      extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{settings.split_on_xml_tag}>,end=</#{settings.split_on_xml_tag}>'}
    end
    extra_str_args   << ' -lazyOutput' if settings[:noempty]  # don't create reduce file if no records
    extra_str_args   << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless settings[:partition_fields].blank?
    extra_str_args
  end

  # `-cmdenv` options passing selected local environment variables
  # (currently only RUBYLIB) through to the job.
  #
  # @return [Array<String>] one option per variable that is set locally
  def hadoop_recycle_env
    %w[RUBYLIB].map do |var|
      %Q{-cmdenv '#{var}=#{ENV[var]}'} if ENV[var]
    end.compact
  end

  # The path to the hadoop runner script
  def hadoop_runner
    settings[:hadoop_runner] || (settings[:hadoop_home]+'/bin/hadoop')
  end

  #
  # Assemble the hadoop command to execute
  # and launch the hadoop runner to execute the script across all tasktrackers
  #
  # FIXME: Should add some simple logic to ensure that commands are in the
  # right order or hadoop will complain. ie. -D settings MUST come before
  # others
  #
  def execute_hadoop_workflow
    # Input paths join by ','
    input_paths = @input_paths.join(',')
    #
    # Use Settings[:hadoop_home] to set the path your config install.
    hadoop_commandline = [
      hadoop_runner,
      "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
      hadoop_jobconf_settings,
      "-D mapred.job.name='#{job_name}'",
      hadoop_other_args,
      "-mapper '#{mapper_commandline}'",
      "-reducer '#{reducer_commandline}'",
      "-input '#{input_paths}'",
      "-output '#{output_path}'",
      "-file '#{this_script_filename}'",
      hadoop_recycle_env,
    ].flatten.compact.join(" \t\\\n ")
    Log.info " Launching hadoop!"
    execute_command!(hadoop_commandline)
  end

end
|
254
|
-
|
255
|
-
|
256
|
-
#
# Describes a hadoop job as reported by a jobtracker.
# Fetching and parsing are still unimplemented stubs.
#
# NOTE(review): `field` is not defined in this file -- confirm where it
# comes from.
#
class HadoopJob
  [:job_id, :k].each { |attr| field attr }

  # Fetches the raw jobtracker contents and parses them.
  #
  # @param jobtracker_host host to query
  # @return whatever #parse_jobtracker_raw returns (currently nil)
  def from_jobtracker(jobtracker_host)
    parse_jobtracker_raw(fetch_jobtracker_raw(jobtracker_host))
  end

  # Not implemented yet; returns nil.
  def fetch_jobtracker_raw(jobtracker_host) ; end

  # Not implemented yet; returns nil.
  def parse_jobtracker_raw(contents) ; end
end
|
274
|
-
|
275
|
-
|
276
|
-
end
|