wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,51 +0,0 @@
|
|
1
|
-
module Monkeyshines
|
2
|
-
module Store
|
3
|
-
class KeyStore < Monkeyshines::Store::Base
|
4
|
-
# The actual backing store; should respond to #set and #get methods
|
5
|
-
attr_accessor :db
|
6
|
-
|
7
|
-
#
|
8
|
-
# Executes block once for each element in the whole DB, in whatever order
|
9
|
-
# the DB thinks you should see it.
|
10
|
-
#
|
11
|
-
# Your block will see |key, val|
|
12
|
-
#
|
13
|
-
# key_store.each do |key, val|
|
14
|
-
# # ... stuff ...
|
15
|
-
# end
|
16
|
-
#
|
17
|
-
def each &block
|
18
|
-
db.iterinit
|
19
|
-
loop do
|
20
|
-
key = db.iternext or break
|
21
|
-
val = db[key]
|
22
|
-
yield key, val
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
|
27
|
-
# Save the value into the database
|
28
|
-
def set(key, val)
|
29
|
-
return unless val
|
30
|
-
db[key] = val
|
31
|
-
end
|
32
|
-
|
33
|
-
alias_method :save, :set
|
34
|
-
def get(key) db[key] end
|
35
|
-
def [](key) db[key] end
|
36
|
-
def close() db.close end
|
37
|
-
def size() db.size end
|
38
|
-
|
39
|
-
#
|
40
|
-
# Load from standard command-line options
|
41
|
-
#
|
42
|
-
# obvs only works when there's just one store
|
43
|
-
#
|
44
|
-
def self.new_from_command_line cmdline_opts, default_opts={}
|
45
|
-
options = default_opts.merge(cmdline_opts)
|
46
|
-
store = self.new(options[:store_db])
|
47
|
-
store
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
@@ -1,22 +0,0 @@
|
|
1
|
-
module Monkeyshines
|
2
|
-
module Store
|
3
|
-
class ReadThruStore < Monkeyshines::Store::TyrantTdbKeyStore
|
4
|
-
|
5
|
-
#
|
6
|
-
# If key is absent, save the result of calling the block.
|
7
|
-
# If key is present, block is never called.
|
8
|
-
#
|
9
|
-
# Ex:
|
10
|
-
# rt_store.set(url) do
|
11
|
-
# fetcher.get url # will only be called if url isn't in rt_store
|
12
|
-
# end
|
13
|
-
#
|
14
|
-
def set key, force=nil, &block
|
15
|
-
return if !force && db.has_key?(key)
|
16
|
-
result = block.call() or return
|
17
|
-
super(key, result)
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
@@ -1,33 +0,0 @@
|
|
1
|
-
require 'tokyocabinet'
|
2
|
-
module Monkeyshines
|
3
|
-
module Store
|
4
|
-
#
|
5
|
-
# Implementation of KeyStore with a Local TokyoCabinet table database (TDB)
|
6
|
-
#
|
7
|
-
class TokyoTdbKeyStore < Monkeyshines::Store::KeyStore
|
8
|
-
|
9
|
-
# pass in the filename or URI of a tokyo cabinet table-style DB
|
10
|
-
# set create_db = true if you want to create a missing DB file
|
11
|
-
def initialize db_uri, *args
|
12
|
-
self.db = TokyoCabinet::TDB.new
|
13
|
-
db.open(db_uri, TokyoCabinet::TDB::OWRITER) or raise "#{self.class.to_s}: Can't open TokyoCabinet TDB #{db_uri}"
|
14
|
-
super *args
|
15
|
-
end
|
16
|
-
|
17
|
-
|
18
|
-
def each_as klass, &block
|
19
|
-
self.each do |key, hsh|
|
20
|
-
yield klass.from_hash hsh
|
21
|
-
end
|
22
|
-
end
|
23
|
-
# Delegate to store
|
24
|
-
def set(key, val)
|
25
|
-
return unless val
|
26
|
-
db.put key, val.to_hash.compact
|
27
|
-
end
|
28
|
-
|
29
|
-
def size() db.rnum end
|
30
|
-
|
31
|
-
end #class
|
32
|
-
end
|
33
|
-
end
|
@@ -1,57 +0,0 @@
|
|
1
|
-
require 'tokyotyrant'
|
2
|
-
module Monkeyshines
|
3
|
-
module Store
|
4
|
-
|
5
|
-
#
|
6
|
-
# Implementation of KeyStore with a Local TokyoCabinet hash database (RDB)
|
7
|
-
#
|
8
|
-
class TyrantRdbKeyStore < Monkeyshines::Store::KeyStore
|
9
|
-
attr_accessor :db_host, :db_port
|
10
|
-
|
11
|
-
# pass in the host:port uri of the key store.
|
12
|
-
def initialize options
|
13
|
-
raise "URI for #{self.class} is required" if options[:uri].blank?
|
14
|
-
self.db_host, self.db_port = options[:uri].to_s.split(':')
|
15
|
-
self.db_host.gsub!(/^(localhost|127\.0\.0\.1)$/,'')
|
16
|
-
super options
|
17
|
-
end
|
18
|
-
|
19
|
-
def db
|
20
|
-
return @db if @db
|
21
|
-
@db ||= TokyoTyrant::RDB.new
|
22
|
-
@db.open(db_host, db_port) or raise("Can't open DB at host #{db_host} port #{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
|
23
|
-
@db
|
24
|
-
end
|
25
|
-
|
26
|
-
def close
|
27
|
-
@db.close if @db
|
28
|
-
@db = nil
|
29
|
-
end
|
30
|
-
|
31
|
-
# Save the value into the database without waiting for a response.
|
32
|
-
def set_nr(key, val)
|
33
|
-
db.putnr key, val if val
|
34
|
-
end
|
35
|
-
|
36
|
-
def size() db.rnum end
|
37
|
-
def include? *args
|
38
|
-
db.has_key? *args
|
39
|
-
end
|
40
|
-
|
41
|
-
# require 'memcache'
|
42
|
-
# def initialize db_uri=nil, *args
|
43
|
-
# # db_uri ||= ':1978'
|
44
|
-
# # self.db_host, self.db_port = db_uri.split(':')
|
45
|
-
# self.db = MemCache.new(db_uri, :no_reply => true)
|
46
|
-
# if !self.db then raise("Can't open DB #{db_uri}. Pass in host:port, default is ':1978' #{db.ecode}: #{db.errmsg(db.ecode)}") end
|
47
|
-
# super *args
|
48
|
-
# end
|
49
|
-
#
|
50
|
-
# def size
|
51
|
-
# db.stats
|
52
|
-
# end
|
53
|
-
|
54
|
-
end #class
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
@@ -1,20 +0,0 @@
|
|
1
|
-
require 'tokyotyrant'
|
2
|
-
require 'tyrant_rdb_key_store'
|
3
|
-
module Monkeyshines
|
4
|
-
module Store
|
5
|
-
#
|
6
|
-
# Implementation of KeyStore with a Local TokyoCabinet Table database (RDBTBL)
|
7
|
-
#
|
8
|
-
class TyrantRdbKeyStore < TyrantRdbKeyStore Monkeyshines::Store::KeyStore
|
9
|
-
|
10
|
-
def db
|
11
|
-
return @db if @db
|
12
|
-
@db ||= TokyoTyrant::RDBTBL.new
|
13
|
-
@db.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
|
14
|
-
@db
|
15
|
-
end
|
16
|
-
|
17
|
-
end #class
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
data/old/wukong/streamer.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Streamer
|
3
|
-
autoload :Base, 'wukong/streamer/base'
|
4
|
-
autoload :LineStreamer, 'wukong/streamer/line_streamer'
|
5
|
-
autoload :RecordStreamer, 'wukong/streamer/record_streamer'
|
6
|
-
autoload :JsonStreamer, 'wukong/streamer/json_streamer'
|
7
|
-
autoload :StructStreamer, 'wukong/streamer/struct_streamer'
|
8
|
-
autoload :StructRecordizer, 'wukong/streamer/struct_streamer'
|
9
|
-
autoload :InstanceStreamer, 'wukong/streamer/instance_streamer'
|
10
|
-
#
|
11
|
-
autoload :Filter, 'wukong/streamer/filter'
|
12
|
-
#
|
13
|
-
autoload :Reducer, 'wukong/streamer/reducer'
|
14
|
-
autoload :AccumulatingReducer, 'wukong/streamer/accumulating_reducer'
|
15
|
-
autoload :CountingReducer, 'wukong/streamer/counting_reducer'
|
16
|
-
autoload :ListReducer, 'wukong/streamer/list_reducer'
|
17
|
-
autoload :RankAndBinReducer, 'wukong/streamer/rank_and_bin_reducer'
|
18
|
-
autoload :UniqByLastReducer, 'wukong/streamer/uniq_by_last_reducer'
|
19
|
-
|
20
|
-
class Streamer < Base
|
21
|
-
end
|
22
|
-
|
23
|
-
class IdentityMapper < Base
|
24
|
-
end
|
25
|
-
|
26
|
-
class IdentityReducer < Base
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
end
|
@@ -1,83 +0,0 @@
|
|
1
|
-
module Wukong
  module Streamer

    #
    # AccumulatingReducer makes it easy to apply one operation across all
    # occurrences of each key
    #
    # On each occurrence of a given key, AccumulatingReducer calls
    # accumulate, and at the final occurrence calls finalize.
    #
    # See ListAccumulatingReducer and KeyCountingReducer for examples
    #
    # Make sure you don't have the bad luck, bad judgement or bad approach to
    # accumulate more data than your box can hold before finalizing.
    #
    class AccumulatingReducer < Wukong::Streamer::Base
      # The key of the group currently being accumulated. Holds the sentinel
      # :__first_pass__ (set in before_stream) until the first record is seen.
      attr_accessor :key

      #
      # override for multiple-field keys, etc.
      #
      # Note that get_key is called by +process+ -- so the arguments have
      # already been +recordize+d. In particular, if you are using
      # StructRecordizer (or StructStreamer), you can write this as
      #
      #   def get_key(thing) thing.id.to_i ; end
      #
      # or whatever
      #
      # By default the key is the first field of the record.
      #
      def get_key(*record)
        record.first
      end

      #
      # Accumulate all records for a given key.
      #
      # When the first record of a new key is seen, finalize processing of the
      # previous group (skipped on the very first record, when there is no
      # previous group), adopt the new key and call start!. Every record --
      # including the one that opened the group -- is then passed to
      # accumulate.
      #
      # The block is handed through to finalize and accumulate.
      #
      def process(*args, &block)
        this_key = get_key(*args)
        if this_key != key                                # if this is a new key,
          finalize(&block) unless key == :__first_pass__  # process what we've collected so far
          self.key = this_key                             # adopt the new key
          start!(*args)                                   # and set up for the next accumulation
        end
        # collect the current record
        accumulate(*args, &block)
      end

      #
      # start! is called on the first record of the new key
      #
      def start!(*args)
      end

      #
      # Override this to accumulate each record for the given key in turn.
      #
      def accumulate(*args, &block)
      end

      #
      # Called once per group, after its last record has been accumulated.
      # It is always invoked with a block (see process and after_stream).
      #
      # You must override this method.
      #
      def finalize(&block)
      end

      # make a sentinel, so process can tell that no group has been started yet
      def before_stream
        self.key = :__first_pass__
      end

      # Finalize the last-seen group, emitting each record it yields. Nothing
      # is finalized if no record was ever processed.
      def after_stream(*args)
        finalize { |record| emit(record) } unless key == :__first_pass__
        super(*args)
      end
    end
  end
end
|
data/old/wukong/streamer/base.rb
DELETED
@@ -1,126 +0,0 @@
|
|
1
|
-
module Wukong
  module Streamer
    #
    # Base streamer: reads lines via each_record, converts each one with
    # recordize, hands the record to process, and emits every record that
    # process yields.
    #
    class Base

      # Options, initially set from the command-line args -- see
      # Script#process_argv!
      attr_reader :own_options

      #
      # Accepts option hash from script runner
      #
      def initialize(options = {})
        @own_options = options
      end

      # The result of Settings.deep_merge applied to this streamer's own options.
      def options
        Settings.deep_merge(own_options)
      end

      #
      # Pass each record to +#process+
      #
      # Calls before_stream once, then for every input line: recordizes it
      # (lines whose record is nil/false are skipped), processes it, emits
      # each yielded output record and tracks progress. Calls after_stream
      # once at the end.
      #
      def stream
        Log.info("Streaming on:\t%s" % [Script.input_file]) unless Script.input_file.blank?
        before_stream
        each_record do |line|
          record = recordize(line.chomp)
          next unless record
          process(*record) { |output_record| emit(output_record) }
          track(record)
        end
        after_stream
      end

      # Hands the leading portion (indices 0..1000) of the record's string
      # form to the periodic monitor.
      def track(record)
        monitor.periodically(record.to_s[0..1000])
      end

      # Yields each line read from $stdin to the block.
      def each_record(&block)
        $stdin.each(&block)
      end

      # Called exactly once, before streaming begins
      def before_stream
      end

      # Called exactly once, after streaming completes
      def after_stream
      end

      #
      # Default recordizer: returns array of fields by splitting at tabs
      #
      # Best-effort: if the line cannot be split, returns nil, which makes
      # +#stream+ skip the line.
      #
      def recordize(line)
        line.split("\t")
      rescue StandardError
        nil
      end

      #
      # Serializes the record to output.
      #
      # Emits a single line of tab-separated fields created by calling #to_flat
      # on the record and joining with "\t".
      #
      # Does no escaping or processing of the record -- that's to_flat's job, or
      # yours if you override this method.
      #
      def emit(record)
        puts record.to_flat.join("\t")
      end

      #
      # Process each record in turn, yielding the records to emit
      #
      # The default implementation yields the fields back unchanged, as one
      # array.
      #
      def process(*args, &block)
        yield(args)
      end

      #
      # To track processing errors inline,
      # pass the line back to bad_record!
      #
      # Warns with (a truncated inspect of) the fields, and prints a
      # tab-separated line whose first field is "bad_record-<key>".
      #
      def bad_record!(key, *args)
        warn "Bad record #{args.inspect[0..400]}"
        puts ["bad_record-#{key}", *args].join("\t")
      end

      # A periodic logger to track progress
      def monitor
        @monitor ||= PeriodicMonitor.new
      end

      # Defines a process method on the fly to execute the given mapper.
      #
      # This is still experimental.
      # Among other limitations, you can't use ++yield++ -- you have to call
      # emit() directly.
      #
      # The block is run with instance_exec, so +self+ inside it is this
      # streamer. Returns self. Raises ArgumentError if no block is given.
      def mapper(&mapper_block)
        raise ArgumentError, 'mapper requires a block' unless mapper_block
        @mapper_block = mapper_block
        # Singleton method: only this instance's process is replaced.
        define_singleton_method(:process) do |*args, &_block|
          instance_exec(*args, &@mapper_block)
        end
        self
      end

      # Creates a new object of this class and injects the given block
      # as the process method. Any positional arguments are passed to the
      # constructor (the instance-level mapper accepts only a block).
      def self.mapper(*args, &block)
        new(*args).mapper(&block)
      end

      # Delegates back to Wukong to run this instance as a mapper
      def run(options = {})
        Wukong.run(self, nil, options)
      end

      # Creates a new object of this class and runs it
      def self.run(options = {})
        Wukong.run(new, nil, options)
      end

    end
  end
end
|