wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,35 +0,0 @@
|
|
1
|
-
require File.expand_path('stopwords', File.dirname(__FILE__))
|
2
|
-
module Wukong
|
3
|
-
module Helper
|
4
|
-
|
5
|
-
module Tokenize
|
6
|
-
#
|
7
|
-
# Split a string into its constituent words.
|
8
|
-
#
|
9
|
-
# This is pretty simpleminded:
|
10
|
-
# * downcase the word
|
11
|
-
# * Split at any non-alphanumeric boundary, including '_'
|
12
|
-
# * However, preserve the special cases of 's, 'd or 't at the end of a
|
13
|
-
# word.
|
14
|
-
#
|
15
|
-
# tokenize("Ability is a poor man's wealth #johnwoodenquote")
|
16
|
-
# # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
|
17
|
-
#
|
18
|
-
def self.tokenize str
|
19
|
-
return [] if str.blank?
|
20
|
-
str = str.downcase;
|
21
|
-
# kill off all punctuation except [stuff]'s or [stuff]'t
|
22
|
-
# this includes hyphens (words are split)
|
23
|
-
str = str.
|
24
|
-
gsub(/[^a-zA-Z0-9\']+/, ' ').
|
25
|
-
gsub(/(\w)\'([stdm]|re|ve|ll)\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
|
26
|
-
# Busticate at whitespace
|
27
|
-
words = str.split(/\s+/)
|
28
|
-
words.reject!{|w| w.length < 3 || Wukong::Corpus::STOPWORDS_3.include?(w) }
|
29
|
-
words
|
30
|
-
end
|
31
|
-
|
32
|
-
end
|
33
|
-
|
34
|
-
end
|
35
|
-
end
|
data/old/wukong/logger.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
# Common logger
|
3
|
-
#
|
4
|
-
# Set your own at any time with
|
5
|
-
# Wukong.logger = YourAwesomeLogger.new(...)
|
6
|
-
# If you have log4r installed you can use
|
7
|
-
# Wukong.logger = Wukong.default_log4r_logger
|
8
|
-
#
|
9
|
-
# If Wukong.logger is too much typing for you,
|
10
|
-
# use the Log constant
|
11
|
-
#
|
12
|
-
# Default format:
|
13
|
-
# I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
|
14
|
-
#
|
15
|
-
def self.logger
|
16
|
-
return @logger if defined?(@logger)
|
17
|
-
require 'logger'
|
18
|
-
@logger = Logger.new STDERR
|
19
|
-
@logger.instance_eval do
|
20
|
-
def dump *args
|
21
|
-
debug args.inspect
|
22
|
-
end
|
23
|
-
end
|
24
|
-
@logger
|
25
|
-
end
|
26
|
-
|
27
|
-
def self.logger= logger
|
28
|
-
@logger = logger
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
#
|
33
|
-
# A convenient logger.
|
34
|
-
#
|
35
|
-
# define Log yourself to prevent its creation
|
36
|
-
#
|
37
|
-
Log = Wukong.logger unless defined?(Log)
|
38
|
-
|
@@ -1,72 +0,0 @@
|
|
1
|
-
Settings.define :log_interval, :default => 10_000, :type => Integer, :description => 'How many iterations between log statements'
|
2
|
-
Settings.define :log_seconds, :default => 30, :type => Integer, :description => 'How many seconds between log statements'
|
3
|
-
|
4
|
-
#
|
5
|
-
# Periodic monitor
|
6
|
-
#
|
7
|
-
#
|
8
|
-
# This is very much a work in progress
|
9
|
-
#
|
10
|
-
class PeriodicMonitor
|
11
|
-
attr_reader :iter, :start_time, :options
|
12
|
-
attr_accessor :interval
|
13
|
-
attr_accessor :time_interval
|
14
|
-
|
15
|
-
def initialize extra_options={}
|
16
|
-
@options = {}
|
17
|
-
@options.deep_merge!( extra_options || {} )
|
18
|
-
@iter = 0
|
19
|
-
@start_time = now
|
20
|
-
@last_report = @start_time
|
21
|
-
@interval = (options[:log_interval] || Settings[:log_interval]).to_i
|
22
|
-
@interval = 1000 unless @interval >= 1
|
23
|
-
@time_interval = (options[:log_seconds] || Settings[:log_seconds]).to_i
|
24
|
-
end
|
25
|
-
|
26
|
-
def periodically *args, &block
|
27
|
-
incr!
|
28
|
-
if ready?
|
29
|
-
@last_report = Time.now
|
30
|
-
if block
|
31
|
-
emit block.call(self, *args)
|
32
|
-
else
|
33
|
-
emit progress(*args)
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def emit log_line
|
39
|
-
Log.info log_line
|
40
|
-
end
|
41
|
-
|
42
|
-
def incr!
|
43
|
-
@iter += 1
|
44
|
-
end
|
45
|
-
|
46
|
-
def ready?
|
47
|
-
(iter % @interval == 0) || (since > time_interval)
|
48
|
-
end
|
49
|
-
|
50
|
-
def progress *stuff
|
51
|
-
[
|
52
|
-
"%15d" % iter,
|
53
|
-
"%7.1f"% elapsed_time, "sec",
|
54
|
-
"%7.1f"% rate, "/sec",
|
55
|
-
now.to_flat,
|
56
|
-
*stuff
|
57
|
-
].flatten.join("\t")
|
58
|
-
end
|
59
|
-
|
60
|
-
def elapsed_time
|
61
|
-
now - start_time
|
62
|
-
end
|
63
|
-
def since
|
64
|
-
now - @last_report
|
65
|
-
end
|
66
|
-
def now
|
67
|
-
Time.now.utc
|
68
|
-
end
|
69
|
-
def rate
|
70
|
-
iter.to_f / elapsed_time
|
71
|
-
end
|
72
|
-
end
|
data/old/wukong/schema.rb
DELETED
@@ -1,269 +0,0 @@
|
|
1
|
-
require 'extlib/inflection'
|
2
|
-
require 'wukong'
|
3
|
-
|
4
|
-
|
5
|
-
#
|
6
|
-
# Basic types: SQL conversion
|
7
|
-
#
|
8
|
-
class << Integer ; def to_sql() 'INT' end ; end
|
9
|
-
class << Bignum ; def to_sql() 'BIGINT' end ; end
|
10
|
-
class << String ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
|
11
|
-
class << Symbol ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
|
12
|
-
class << BigDecimal ; def to_sql() 'DECIMAL' end ; end if defined?(BigDecimal)
|
13
|
-
class << EpochTime ; def to_sql() 'INT' end ; end if defined?(EpochTime)
|
14
|
-
class << FilePath ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath)
|
15
|
-
class << Flag ; def to_sql() 'CHAR(1) CHARACTER SET ASCII' end ; end if defined?(Flag)
|
16
|
-
class << IPAddress ; def to_sql() 'CHAR(15) CHARACTER SET ASCII' end ; end if defined?(IPAddress)
|
17
|
-
class << URI ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI)
|
18
|
-
class << Csv ; def to_sql() 'TEXT' end ; end if defined?(Csv)
|
19
|
-
class << Yaml ; def to_sql() 'TEXT' end ; end if defined?(Yaml)
|
20
|
-
class << Json ; def to_sql() 'TEXT' end ; end if defined?(Json)
|
21
|
-
class << Regex ; def to_sql() 'TEXT' end ; end if defined?(Regex)
|
22
|
-
class String ; def to_sql() self ; end ; end
|
23
|
-
class Symbol ; def to_sql() self.to_s.upcase ; end ; end
|
24
|
-
|
25
|
-
#
|
26
|
-
# Basic types: Pig conversion
|
27
|
-
#
|
28
|
-
class << Integer ; def to_pig() 'int' end ; end
|
29
|
-
class << Bignum ; def to_pig() 'long' end ; end
|
30
|
-
class << Float ; def to_pig() 'float' end ; end
|
31
|
-
class << Symbol ; def to_pig() 'chararray' end ; end
|
32
|
-
class << Date ; def to_pig() 'long' end ; end
|
33
|
-
class << Time ; def to_pig() 'long' end ; end
|
34
|
-
class << DateTime ; def to_pig() 'long' end ; end
|
35
|
-
class << String ; def to_pig() 'chararray' end ; end
|
36
|
-
class << Text ; def to_pig() 'chararray' end ; end if defined?(Text)
|
37
|
-
class << Blob ; def to_pig() 'bytearray' end ; end if defined?(Blob)
|
38
|
-
class << Boolean ; def to_pig() 'bytearray' end ; end if defined?(Boolean)
|
39
|
-
class String ; def to_pig() self.to_s ; end ; end
|
40
|
-
class Symbol ; def to_pig() self.to_s ; end ; end
|
41
|
-
|
42
|
-
class << BigDecimal ; def to_pig() 'long' end ; end if defined?(BigDecimal)
|
43
|
-
class << EpochTime ; def to_pig() 'integer' end ; end if defined?(EpochTime)
|
44
|
-
class << FilePath ; def to_pig() 'chararray' end ; end if defined?(FilePath)
|
45
|
-
class << Flag ; def to_pig() 'chararray' end ; end if defined?(Flag)
|
46
|
-
class << IPAddress ; def to_pig() 'chararray' end ; end if defined?(IPAddress)
|
47
|
-
class << URI ; def to_pig() 'chararray' end ; end if defined?(URI)
|
48
|
-
class << Csv ; def to_pig() 'chararray' end ; end if defined?(Csv)
|
49
|
-
class << Yaml ; def to_pig() 'chararray' end ; end if defined?(Yaml)
|
50
|
-
class << Json ; def to_pig() 'chararray' end ; end if defined?(Json)
|
51
|
-
class << Regex ; def to_pig() 'chararray' end ; end if defined?(Regex)
|
52
|
-
|
53
|
-
|
54
|
-
#
|
55
|
-
# Basic types: Avro conversion
|
56
|
-
#
|
57
|
-
# Avro primitive type names for the core Ruby classes.  The names used here are
# the ones the Avro specification defines: int, long, float, string, bytes and
# boolean.
class << Integer ;    def to_avro() 'int'       end ; end
# Bignum was merged into Integer in Ruby 2.4 and removed in 3.2.  Only install
# the 'long' mapping where Bignum is still a distinct class; otherwise this
# would either raise NameError or silently turn Integer's mapping into 'long'.
if defined?(Bignum) && !Bignum.equal?(Integer)
  class << Bignum ;   def to_avro() 'long'      end ; end
end
class << Float ;      def to_avro() 'float'     end ; end
class << Symbol ;     def to_avro() 'string'    end ; end
class << Date ;       def to_avro() 'long'      end ; end
class << Time ;       def to_avro() 'long'      end ; end
class << DateTime ;   def to_avro() 'long'      end ; end
class << String ;     def to_avro() 'string'    end ; end
class << Text ;       def to_avro() 'string'    end ; end if defined?(Text)
# 'bytearray' is a Pig type name; the Avro primitives are 'bytes' and 'boolean'.
class << Blob ;       def to_avro() 'bytes'     end ; end if defined?(Blob)
class << Boolean ;    def to_avro() 'boolean'   end ; end if defined?(Boolean)
# Instances of String and Symbol render as their own text.
class String ;        def to_avro() self.to_s ; end ; end
class Symbol ;        def to_avro() self.to_s ; end ; end

class << BigDecimal ; def to_avro() 'long'      end ; end if defined?(BigDecimal)
# 'integer' is not an Avro type name.  'long' matches the mapping used for
# Time/Date above and does not overflow for epoch values past 2038.
class << EpochTime ;  def to_avro() 'long'      end ; end if defined?(EpochTime)
class << FilePath ;   def to_avro() 'string'    end ; end if defined?(FilePath)
class << Flag ;       def to_avro() 'string'    end ; end if defined?(Flag)
class << IPAddress ;  def to_avro() 'string'    end ; end if defined?(IPAddress)
class << URI ;        def to_avro() 'string'    end ; end if defined?(URI)
class << Csv ;        def to_avro() 'string'    end ; end if defined?(Csv)
class << Yaml ;       def to_avro() 'string'    end ; end if defined?(Yaml)
class << Json ;       def to_avro() 'string'    end ; end if defined?(Json)
class << Regex ;      def to_avro() 'string'    end ; end if defined?(Regex)
|
81
|
-
|
82
|
-
module Wukong
  #
  # Export a model's structure for loading and manipulating in other
  # frameworks, such as SQL, Pig and Avro.
  #
  # Your class should support the #resource_name, #members and #mtypes methods.
  # An easy way to do this is by being a TypedStruct.
  #
  # You can use this to do silly stunts like
  #
  #   % ruby -rubygems -r'wukong/schema' -e 'require "/path/to/user_model.rb" ; puts User.pig_load ; '
  #
  # If you include the classes from Wukong::Datatypes::MoreTypes, you can draw
  # on a richer set of type definitions
  #
  #   require 'wukong/datatypes/more_types'
  #   include Wukong::Datatypes::MoreTypes
  #   require 'wukong/schema'
  #
  # (if you're using Wukong to bulk-process Datamapper records, these should
  # fall right in line as well -- make sure *not* to include
  # Wukong::Datatypes::MoreTypes, and to require 'dm-more' before 'wukong/schema')
  #
  module Schema
    module ClassMethods

      #
      # Table name for this class: the pluralized resource name.
      #
      def table_name
        resource_name.to_s.pluralize
      end

      # ===========================================================================
      #
      # Pig
      #

      # Export schema as Pig: a comma-separated list of "member: type" pairs.
      #
      # Won't correctly handle complex types (struct having struct as member, eg)
      #
      def to_pig
        members.zip(mtypes).map do |member, type|
          member.to_s + ': ' + type.to_pig
        end.join(', ')
      end

      #
      # A pig snippet to load a tsv file containing
      # serialized instances of this class.
      #
      # Assumes the first column is the resource name (you can, and probably
      # should, follow with an immediate GENERATE to ditch that field.)
      #
      # filename defaults to "<resource_name>.tsv".
      #
      def pig_load(filename=nil)
        filename ||= resource_name.to_s+'.tsv'
        # Relation name: the class name with any namespace prefix stripped.
        relation = "%-23s" % self.to_s.gsub(/^.*\W/, "")
        [
          relation,
          "= LOAD '#{filename}'",
          "AS ( rsrc:chararray,", self.to_pig, ') ;',
        ].join(" ")
      end

      # ===========================================================================
      #
      # SQL

      #
      # Schema definition for use in a CREATE TABLE statement: one
      # "`column` TYPE" line per member.  Types that don't respond to #to_sql
      # fall back to their upcased name.
      #
      def to_sql
        members.zip(mtypes).map do |attr, type|
          type_str = type.respond_to?(:to_sql) ? type.to_sql : type.to_s.upcase
          " %-29s\t%s" %["`#{attr}`", type_str]
        end.join(",\n")
      end

      #
      # List off member names, to be stuffed into a SELECT or a LOAD DATA
      #
      def sql_members
        members.map{|attr| "`#{attr}`" }.join(", ")
      end

      #
      # Creates a table for the wukong class.
      #
      # * primary_key gives the name of one column to be set as the primary key
      #
      # * if drop_first is given, a "DROP TABLE IF EXISTS" statement will
      #   precede the snippet.
      #
      # * table_options sets the table parameters. Useful table_options for a
      #   read-only database in MySQL:
      #     ENGINE=MyISAM PACK_KEYS=0
      #
      def sql_create_table(primary_key=nil, drop_first=nil, table_options='')
        columns = self.to_sql
        str = []
        str << %Q{DROP TABLE IF EXISTS `#{self.table_name}`; } if drop_first
        str << %Q{CREATE TABLE `#{self.table_name}` ( }
        if primary_key
          # The column list needs a trailing comma before the key clause.
          str << "#{columns},"
          str << %Q{ PRIMARY KEY \t(`#{primary_key}`)}
        else
          str << columns
        end
        str << %Q{ ) #{table_options} ;}
        str.join("\n")
      end

      #
      # A mysql snippet to bulk load the tab-separated-values file emitted by a
      # Wukong script.
      #
      # Let's say your class is ClickLog; its resource_name is "click_log"
      # and thus its table_name is 'click_logs'. sql_load_mysql will:
      #
      # * disable indexing on the table
      # * import the file, replacing any existing rows. (Replacement is governed
      #   by primary key and unique index constraints -- see the mysql docs).
      # * re-enable indexing on that table
      # * show the current time and the number of rows now in the table
      #
      # The load portion will
      #
      # * Load into a table named click_logs
      # * from a file named click_logs.tsv (any ":resource_name" in the given
      #   filename is replaced by the table name)
      # * taking only lines that start with the resource name, 'click_log'
      # * and all remaining fields in their #members order
      # * assuming strings are wukong_encode'd and so shouldn't be escaped or enclosed.
      #
      # Why the "LINES STARTING BY" part?  For map/reduce outputs that have many
      # different objects jumbled together, you can just dump in the whole file,
      # landing each object in its correct table.
      #
      # The filename argument is never modified, so frozen strings are fine.
      #
      def sql_load_mysql(filename=nil)
        # Non-destructive gsub: don't mutate (or choke on a frozen) caller string.
        filename = (filename || ":resource_name.tsv").gsub(/:resource_name/, self.table_name)
        str = []
        # disable indexing during bulk load
        str << %Q{ALTER TABLE `#{self.table_name}` DISABLE KEYS; }
        # Bulk load the tab-separated-values file.
        str << %Q{LOAD DATA LOCAL INFILE '#{filename}'}
        str << %Q{ REPLACE INTO TABLE `#{self.table_name}` }
        str << %Q{ COLUMNS }
        str << %Q{ TERMINATED BY '\\t' }
        str << %Q{ OPTIONALLY ENCLOSED BY '' }
        str << %Q{ ESCAPED BY '' }
        str << %Q{ LINES STARTING BY '#{self.resource_name}' }
        str << %Q{ ( @dummy,\n }
        str << ' '+self.sql_members
        str << %Q{\n ); }
        # Re-enable indexing
        str << %Q{ALTER TABLE `#{self.table_name}` ENABLE KEYS ; }
        # Show it loaded correctly
        str << %Q{SELECT NOW(), COUNT(*), '#{self.table_name}' FROM `#{self.table_name}`; }
        str.join("\n")
      end

      #
      # Avro
      #
      # Export schema as an Avro record definition, serialized to JSON.
      #
      def to_avro
        require 'json' # loaded lazily: only Avro export needs it
        fields = members.zip(mtypes).map do |member, type|
          {:name => member.to_s, :type => type.to_avro}
        end
        { :name => self.name, :type => "record", :fields => fields }.to_json
      end

    end

    # standard stanza for making methods appear on the class itself on include
    def self.included(base)
      base.extend(ClassMethods)
    end
  end
end
|
265
|
-
|
266
|
-
#
# Mix Wukong::Schema into Struct, so that every Struct subclass (TypedStructs
# included) picks up the schema-export class methods.
#
class Struct
  include Wukong::Schema
end
|
data/old/wukong/script.rb
DELETED
@@ -1,286 +0,0 @@
|
|
1
|
-
require 'pathname'
|
2
|
-
require 'wukong/extensions'
|
3
|
-
require 'configliere' ; Settings.use(:commandline, :env_var, :define)
|
4
|
-
require 'wukong'
|
5
|
-
require 'wukong/script/hadoop_command'
|
6
|
-
require 'wukong/script/local_command'
|
7
|
-
require 'rbconfig' # for uncovering ruby_interpreter_path
|
8
|
-
require 'wukong/streamer' ; include Wukong::Streamer
|
9
|
-
module Wukong
  # == How to run a Wukong script
  #
  #   your/script.rb --run path/to/input_files path/to/output_dir
  #
  # All of the file paths are HDFS paths ; your script path, of course, is on the local filesystem.
  #
  # == Command-line options
  #
  # If you'd like to listen for any command-line options, specify them at the
  # command line:
  #
  #   your/script.rb --my_bool_opt --my_val_taking_opt=val \
  #     --run path/to/input_files path/to/output_dir
  #
  # In this case the options hash for both Mapper and Reducer will contain
  #
  #   :my_bool_opt       => true,
  #   :my_val_taking_opt => 'val'
  #
  # == Complicated input paths
  #
  # To use more than one file as input, you can use normal * ? [] wildcards or
  # give a comma-separated list -- see the hadoop documentation for syntax.
  #
  # == Run in Elastic MapReduce Mode (--run=emr)
  #
  # Wukong can be used to start scripts on the amazon cloud
  #
  # * copies the script to s3 in two parts
  # * invokes it using the amazon API
  #
  # == Run locally (--run=local)
  #
  # To run your script locally, use --run=local
  #
  #   your/script.rb --run=local path/to/input_files path/to/output_dir
  #
  # This will pipe the contents of path/to/input_files through first your
  # mapper, then sort, then the reducer, storing the results in the given output
  # directory.
  #
  # All paths refer to the /local/ filesystem -- hadoop is never involved and in
  # fact doesn't even have to be installed.
  #
  # == How to test your scripts
  #
  # You can supply the --map argument in place of --run to run the mapper on its
  # own (and similarly, --reduce to run the reducer standalone):
  #
  #   cat ./local/test/input.tsv | ./examples/word_count.rb --map | more
  #
  # or, if your test data lies on the HDFS,
  #
  #   hdp-cat test/input.tsv | ./examples/word_count.rb --map | more
  #
  #
  class Script
    include Wukong::HadoopCommand
    include Wukong::LocalCommand
    attr_reader :mapper, :reducer, :options
    attr_reader :input_paths, :output_path

    # ---------------------------------------------------------------------------
    #
    # Default options for Wukong
    #   http://github.com/infochimps/wukong
    #
    # If you set an environment variable WUKONG_CONFIG, *or* if the file
    # $HOME/.wukong.rb exists, that file will be +require+'d as well.
    #
    # Important values to set:
    #
    # * hadoop_home -- Path to root of hadoop install. If your hadoop runner is
    #     /usr/local/share/hadoop/bin/hadoop
    #   then your hadoop_home is
    #     /usr/local/share/hadoop.
    #   You can also set a :hadoop_runner that gives the full path to the hadoop script
    #
    # * default_run_mode -- Whether to run using hadoop (and
    #   thus, requiring a working hadoop install), or to run in local mode
    #   (script --map | sort | script --reduce)
    #
    Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
    Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
    Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
    Settings.define :run, :env_var => 'WUKONG_RUN_MODE', :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud; 'map' or 'reduce' to run that phase.", :wukong => true
    Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
    Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
    Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
    Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true

    #
    # Instantiate the Script with the Mapper and the Reducer class (each a
    # Wukong::Streamer) it should call back.  Either may be given as a class
    # (which is instantiated with no arguments), as a ready-made object, or as
    # +nil+.  extra_options are merged into the resolved Settings.
    #
    # The last non-option command-line argument becomes the output path; the
    # remaining non-blank ones become the input paths.  Raises a RuntimeError
    # if either is missing, unless this is a dry run or the script is running
    # a single map or reduce phase.
    #
    # == Identity or External program as map or reduce
    #
    # To use the identity reducer ('cat'), instantiate your Script class with
    # +nil+ as the reducer class. (And similarly to use an identity mapper,
    # supply +nil+ for the mapper class.)
    #
    # To use an external program as your reducer (mapper), override the
    # reduce_command (map_command) method to return the full command line
    # expression to call.
    # NOTE(review): in this class, a +nil+ reducer (mapper) makes
    # #reducer_commandline (#mapper_commandline) return the :reduce_command
    # (:map_command) setting; confirm where a reduce_command method override is
    # actually consulted.
    #
    #   class MyMapper < Wukong::Streamer::Base
    #     # ... awesome stuff ...
    #   end
    #
    #   class MyScript < Wukong::Script
    #     # prefix each unique line with the count of its occurrences.
    #     def reduce_command
    #       '/usr/bin/uniq -c'
    #     end
    #   end
    #   MyScript.new(MyMapper, nil).run
    #
    def initialize(mapper, reducer=nil, extra_options={})
      Settings.resolve!
      @options = Settings
      options.merge! extra_options
      # Classes are instantiated; instances and nil are used as given.
      @mapper  = mapper.is_a?(Class)  ? mapper.new  : mapper
      @reducer = reducer.is_a?(Class) ? reducer.new : reducer
      @output_path = options.rest.pop
      @input_paths = options.rest.reject(&:blank?)
      paths_missing = input_paths.blank? || output_path.blank?
      single_phase  = ['map', 'reduce'].include?(run_mode)
      if paths_missing && !options[:dry_run] && !single_phase
        raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}"
      end
    end

    #
    # In --run mode, use the framework (local, hadoop, emr, etc) to re-launch
    # the script as mapper, reducer, etc.
    # If --map or --reduce, dispatch to the mapper or reducer.
    # Any other run mode prints the usage message.
    #
    def run
      case run_mode
      when 'map'              then mapper.stream
      when 'reduce'           then reducer.stream
      when 'local'            then execute_local_workflow
      when 'cassandra'        then execute_hadoop_workflow
      when 'hadoop', 'mapred' then execute_hadoop_workflow
      when 'emr'
        require 'wukong/script/emr_command' # loaded lazily: only emr mode needs it
        execute_emr_workflow
      else dump_help
      end
    end

    #
    # The run mode as a string.  --map and --reduce win, then a script name
    # ending in -mapper.rb / -reducer.rb, then the value given to --run.
    # If only --run is given (no value), assume the default run mode.
    #
    def run_mode
      case
      when options[:map]           then 'map'
      when options[:reduce]        then 'reduce'
      when ($0 =~ /-mapper\.rb$/)  then 'map'
      when ($0 =~ /-reducer\.rb$/) then 'reduce'
      when (options[:run] == true) then options[:default_run_mode]
      else                              options[:run].to_s
      end
    end

    #
    # Shell command for map phase. By default, calls the script in --map mode
    # In hadoop mode, this is given to the hadoop streaming command.
    # In local mode, it's given to the system() call
    #
    # With no mapper, returns the :map_command setting instead.
    #
    def mapper_commandline(run_option=:local)
      mapper ? phase_commandline('map', run_option) : options[:map_command]
    end

    #
    # Shell command for reduce phase. By default, calls the script in --reduce mode
    # In hadoop mode, this is given to the hadoop streaming command.
    # In local mode, it's given to the system() call
    #
    # With no reducer, returns the :reduce_command setting instead.
    #
    def reducer_commandline(run_option=:local)
      reducer ? phase_commandline('reduce', run_option) : options[:reduce_command]
    end

    #
    # The :job_name setting if given; otherwise a name built from the script
    # name, input paths and output path, stripped of every character other
    # than word characters and / . - +
    #
    def job_name
      options[:job_name] ||
        "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
    end

    # Wrapper for dangerous operations to catch errors: runs the block and
    # hands any StandardError to handle_error along with the action.
    # (handle_error is not defined in this class.)
    def safely(action, &block)
      block.call
    rescue StandardError => e
      handle_error(action, e)
    end

    protected

    #
    # Execute the runner phase:
    # use the running framework to relaunch the script in map and in reduce mode
    #
    # The arguments are flattened, blanks dropped, and joined into one shell
    # command.  On a dry run the command is only logged.  Raises a RuntimeError
    # if the command exits unsuccessfully.
    #
    def execute_command!(*args)
      command = args.flatten.reject(&:blank?).join(" \\\n ")
      Log.info "Running\n\n#{command}\n"
      if options[:dry_run]
        Log.info '== [Not running preceding command: dry run] =='
      else
        maybe_overwrite_output_paths! output_path
        $stdout.puts `#{command}`
        raise "Streaming command failed!" unless $?.success?
      end
    end

    #
    # In hadoop mode only, removes the destination path before launching
    # (when --overwrite or --rm was given).
    #
    # To the panic-stricken: look in .Trash/current/path/to/accidentally_deleted_files
    #
    def maybe_overwrite_output_paths!(output_path)
      if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
        cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
        Log.info "Removing output file #{output_path}: #{cmd}"
        puts `#{cmd}`
      end
    end

    # Reassemble all the non-internal-to-wukong options into a command line for
    # the map/reducer phase scripts
    def non_wukong_params
      options.
        reject{|param, val| options.definition_of(param, :wukong) }.
        map{|param,val| "--#{param}=#{val}" }.
        join(" ")
    end

    # Command line that re-invokes this script for one phase ('map' or
    # 'reduce').  Local mode uses the script's full path, hadoop mode only its
    # basename; any other run_option yields nil.
    def phase_commandline(phase, run_option)
      script =
        case run_option
        when :local  then this_script_filename
        when :hadoop then File.basename(this_script_filename)
        else return nil
        end
      "#{ruby_interpreter_path} #{script} --#{phase} " + non_wukong_params
    end

    # the full, real path to the script file
    def this_script_filename
      Pathname.new($0).realpath
    end

    # use the full ruby interpreter path to run child processes
    def ruby_interpreter_path
      # RbConfig replaces the top-level Config constant, which newer Rubies
      # no longer define.
      Pathname.new(File.join(
          RbConfig::CONFIG["bindir"],
          RbConfig::CONFIG["RUBY_INSTALL_NAME"]+RbConfig::CONFIG["EXEEXT"])).realpath
    end

    #
    # Usage
    #
    def dump_help
      options.dump_help [
        "Please specify a run mode: you probably want to start with",
        "#{$0} --run=local input.tsv output.tsv",
        "although",
        "cat input.tsv | #{$0} --map > mapped.tsv",
        "or",
        "cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv",
        "can be useful for initial testing.",
      ].join("\n")
    end

  end
end
|