wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
| @@ -1,33 +0,0 @@ | |
| 1 | 
            -
            #!/usr/bin/env ruby
         | 
| 2 | 
            -
            require 'rubygems'
         | 
| 3 | 
            -
            require 'wukong/script'
         | 
| 4 | 
            -
             | 
| 5 | 
            -
            Settings.define :sampling_fraction, :type => Float, :required => true, :description => "floating-point number between 0 and 1 giving the fraction of lines to emit: at sampling_fraction=1 all records are emitted, at 0 none are."
         | 
| 6 | 
            -
             | 
| 7 | 
            -
            #
         | 
| 8 | 
            -
            # Probabilistically emit some fraction of record/lines
         | 
| 9 | 
            -
            #
         | 
| 10 | 
            -
            # Set the sampling fraction at the command line using the
         | 
| 11 | 
            -
            #   --sampling_fraction=
         | 
| 12 | 
            -
            # option: for example, to take a random 1/1000th of the lines in huge_files,
         | 
| 13 | 
            -
            #  ./examples/sample_records.rb --sampling_fraction=0.001 --run huge_files sampled_files
         | 
| 14 | 
            -
            #
         | 
| 15 | 
            -
            class Mapper < Wukong::Streamer::LineStreamer
         | 
| 16 | 
            -
              include Wukong::Streamer::Filter
         | 
| 17 | 
            -
             | 
| 18 | 
            -
              #
         | 
| 19 | 
            -
              # randomly decide to emit +sampling_fraction+ fraction of lines
         | 
| 20 | 
            -
              #
         | 
| 21 | 
            -
              def emit? line
         | 
| 22 | 
            -
                rand < Settings.sampling_fraction
         | 
| 23 | 
            -
              end
         | 
| 24 | 
            -
            end
         | 
| 25 | 
            -
             | 
| 26 | 
            -
            #
         | 
| 27 | 
            -
            # Executes the script
         | 
| 28 | 
            -
            #
         | 
| 29 | 
            -
            Wukong.run( Mapper,
         | 
| 30 | 
            -
              nil,
         | 
| 31 | 
            -
              :reduce_tasks => 0,
         | 
| 32 | 
            -
              :reuse_jvms   => true
         | 
| 33 | 
            -
              )
         | 
| @@ -1,15 +0,0 @@ | |
| 1 | 
            -
            #!/usr/bin/env ruby -E ASCII-8BIT
         | 
| 2 | 
            -
            require 'rubygems'
         | 
| 3 | 
            -
            require 'wukong/script'
         | 
| 4 | 
            -
            $: << File.dirname(__FILE__)
         | 
| 5 | 
            -
            require 'logline'
         | 
| 6 | 
            -
             | 
| 7 | 
            -
            class ApacheLogParser < Wukong::Streamer::LineStreamer
         | 
| 8 | 
            -
             | 
| 9 | 
            -
              # create a Logline object from each record and serialize it flat to disk
         | 
| 10 | 
            -
              def process line
         | 
| 11 | 
            -
                yield Logline.parse(line)
         | 
| 12 | 
            -
              end
         | 
| 13 | 
            -
            end
         | 
| 14 | 
            -
             | 
| 15 | 
            -
            Wukong.run( ApacheLogParser, nil, :sort_fields => 7 ) if $0 == __FILE__
         | 
| @@ -1,48 +0,0 @@ | |
| 1 | 
            -
            #!/usr/bin/env ruby -E BINARY
         | 
| 2 | 
            -
            require 'rubygems'
         | 
| 3 | 
            -
            require 'faraday'
         | 
| 4 | 
            -
            require 'wukong/script'
         | 
| 5 | 
            -
            require 'json'
         | 
| 6 | 
            -
            $: << File.dirname(__FILE__)
         | 
| 7 | 
            -
            require 'apache_log_parser'
         | 
| 8 | 
            -
            require 'nook/faraday_dummy_adapter'
         | 
| 9 | 
            -
             | 
| 10 | 
            -
            Settings.define :target_host,   :default => 'localhost', :description => "The host name or IP address to target"
         | 
| 11 | 
            -
            Settings.define :target_scheme, :default => 'http',      :description => "Request scheme (http, https)"
         | 
| 12 | 
            -
             | 
| 13 | 
            -
            #
         | 
| 14 | 
            -
            # A Nook consumes its input stream and, for each input, generates an HTTP
         | 
| 15 | 
            -
            # request against a remote host. Please use it for good and never for evil.
         | 
| 16 | 
            -
            #
         | 
| 17 | 
            -
            # You can use it from your command line:
         | 
| 18 | 
            -
            #   zcat /var/www/app/current/log/*access*.log.gz | ./nook.rb --map --host=http://my_own_host.com
         | 
| 19 | 
            -
            #
         | 
| 20 | 
            -
            #
         | 
| 21 | 
            -
            class NookMapper < ApacheLogParser
         | 
| 22 | 
            -
              # create a Logline object from each record and serialize it flat to disk
         | 
| 23 | 
            -
              def process line
         | 
| 24 | 
            -
                super(line) do |logline|
         | 
| 25 | 
            -
                  start = Time.now
         | 
| 26 | 
            -
                  resp = fetcher.get(logline.path, :user_agent => logline.ua, :referer => logline.referer)
         | 
| 27 | 
            -
                  yield [Time.now.to_flat, (Time.now - start).to_f, resp.status, resp.body.size, logline.path, resp.body]
         | 
| 28 | 
            -
                end
         | 
| 29 | 
            -
              end
         | 
| 30 | 
            -
             | 
| 31 | 
            -
              def track record
         | 
| 32 | 
            -
                monitor.periodically do |m|
         | 
| 33 | 
            -
                  m.progress
         | 
| 34 | 
            -
                end
         | 
| 35 | 
            -
              end
         | 
| 36 | 
            -
             | 
| 37 | 
            -
              # a mock fetcher with a uniformly distributed variable delay
         | 
| 38 | 
            -
              def fetcher
         | 
| 39 | 
            -
                @fetcher ||= Faraday::Connection.new(:url => 'http://localhost:80/') do |f|
         | 
| 40 | 
            -
                  f.use Faraday::Adapter::Dummy do |dummy|
         | 
| 41 | 
            -
                    dummy.delay = Proc.new{|env| 0.05  } # 0.2 * rand()
         | 
| 42 | 
            -
                    # dummy.body = Proc.new{|env| env[:url] }
         | 
| 43 | 
            -
                  end
         | 
| 44 | 
            -
                end
         | 
| 45 | 
            -
              end
         | 
| 46 | 
            -
            end
         | 
| 47 | 
            -
             | 
| 48 | 
            -
            Wukong.run( NookMapper, nil, :sort_fields => 7 )
         | 
| @@ -1,94 +0,0 @@ | |
| 1 | 
            -
             | 
| 2 | 
            -
            module Faraday
         | 
| 3 | 
            -
              class Adapter
         | 
| 4 | 
            -
             | 
| 5 | 
            -
                # test = Faraday::Connection.new do |f|
         | 
| 6 | 
            -
                #   f.use Faraday::Adapter::Dummy do |dummy|
         | 
| 7 | 
            -
                #     dummy.status 404
         | 
| 8 | 
            -
                #     dummy.delay  1
         | 
| 9 | 
            -
                #   end
         | 
| 10 | 
            -
                # end
         | 
| 11 | 
            -
                #
         | 
| 12 | 
            -
                # # this will delay 0.2s, returning 404 with
         | 
| 13 | 
            -
                # resp = text.get("/your/mom", :dummy_delay => 0.2)
         | 
| 14 | 
            -
                # resp.body # => {"method":"get","url":"/your/mom","request_headers":{"Dummy-Delay":"0.2","dummy_delay":0.2},"request":{"proxy":null},"ssl":{}}
         | 
| 15 | 
            -
                #
         | 
| 16 | 
            -
                # More example:
         | 
| 17 | 
            -
                #
         | 
| 18 | 
            -
                # test = Faraday::Connection.new do |f|
         | 
| 19 | 
            -
                #   f.use Faraday::Adapter::Dummy, :status => 503
         | 
| 20 | 
            -
                # end
         | 
| 21 | 
            -
                #
         | 
| 22 | 
            -
                # test = Faraday::Connection.new do |f|
         | 
| 23 | 
            -
                #   f.use Faraday::Adapter::Dummy do |dummy|
         | 
| 24 | 
            -
                #     dummy.delay = Proc.new{|env| 0.1 + 0.8 * rand() }
         | 
| 25 | 
            -
                #   end
         | 
| 26 | 
            -
                # end
         | 
| 27 | 
            -
                #
         | 
| 28 | 
            -
                class Dummy < Middleware
         | 
| 29 | 
            -
                  include Addressable
         | 
| 30 | 
            -
                  attr_reader :config
         | 
| 31 | 
            -
                  def self.loaded?() false end
         | 
| 32 | 
            -
             | 
| 33 | 
            -
                  # gets value from environment if set, configured instance variable otherwise
         | 
| 34 | 
            -
                  def value_for env, key
         | 
| 35 | 
            -
                    val = env[:request_headers]["Dummy-#{header_hash_key(key)}"] || config[key]
         | 
| 36 | 
            -
                    if val.respond_to?(:call)
         | 
| 37 | 
            -
                      val = val.call(env)
         | 
| 38 | 
            -
                    end
         | 
| 39 | 
            -
                    val
         | 
| 40 | 
            -
                  end
         | 
| 41 | 
            -
             | 
| 42 | 
            -
                  # With an optional delay, constructs a [status, headers, response] based on the first of:
         | 
| 43 | 
            -
                  # * request header field (Dummy-Status, Dummy-Headers, Dummy-Resonse)
         | 
| 44 | 
            -
                  # * adapter's configuration:
         | 
| 45 | 
            -
                  # * Unless one of the above is set, body will return a json string taken from the request hash
         | 
| 46 | 
            -
                  #
         | 
| 47 | 
            -
                  def call(env)
         | 
| 48 | 
            -
                    status  = value_for(env, :status)
         | 
| 49 | 
            -
                    headers = value_for(env, :headers)
         | 
| 50 | 
            -
                    headers = JSON.load(headers) if headers.is_a? String
         | 
| 51 | 
            -
                    body    = value_for(env, :body) ||
         | 
| 52 | 
            -
                      env.dup.tap{|hsh| [:response, :parallel_manager, :body].each{|k| hsh.delete k} }.to_json
         | 
| 53 | 
            -
                    delay   = value_for(env, :delay).to_f
         | 
| 54 | 
            -
                    sleep delay if delay > 0
         | 
| 55 | 
            -
                    headers[:dummy_delay] = delay
         | 
| 56 | 
            -
                    env.update(
         | 
| 57 | 
            -
                      :status           => status,
         | 
| 58 | 
            -
                      :response_headers => headers,
         | 
| 59 | 
            -
                      :body             => body)
         | 
| 60 | 
            -
                    @app.call(env)
         | 
| 61 | 
            -
                  end
         | 
| 62 | 
            -
             | 
| 63 | 
            -
                  class Configurator < Struct.new(:status, :headers, :delay, :body)
         | 
| 64 | 
            -
                    def status(val=nil)  self.status  = val if val ; super() end
         | 
| 65 | 
            -
                    def headers(val=nil) self.headers = val if val ; super() end
         | 
| 66 | 
            -
                    def body(val=nil)    self.body    = val if val ; super() end
         | 
| 67 | 
            -
                    def delay(val=nil)   self.delay   = val if val ; super() end
         | 
| 68 | 
            -
                    def self.from_hash hsh
         | 
| 69 | 
            -
                      new().tap{|config| hsh.each{|k,v| config.send("#{k}=", v) } }
         | 
| 70 | 
            -
                    end
         | 
| 71 | 
            -
                  end
         | 
| 72 | 
            -
             | 
| 73 | 
            -
                  def initialize(app, defaults={}, &block)
         | 
| 74 | 
            -
                    super(app)
         | 
| 75 | 
            -
                    @config = Configurator.from_hash(defaults.reverse_merge(:status => 200, :delay => 0, :headers => {}))
         | 
| 76 | 
            -
                    configure(&block) if block
         | 
| 77 | 
            -
                  end
         | 
| 78 | 
            -
             | 
| 79 | 
            -
                  def configure
         | 
| 80 | 
            -
                    yield config
         | 
| 81 | 
            -
                  end
         | 
| 82 | 
            -
             | 
| 83 | 
            -
                  # same as in Faraday::Utils -- turns :dummy_response_status into 'Dummy-Response-Status'
         | 
| 84 | 
            -
                  def header_hash_key(str)
         | 
| 85 | 
            -
                    str.to_s.split('_').each{|w| w.capitalize! }.join('-')
         | 
| 86 | 
            -
                  end
         | 
| 87 | 
            -
             | 
| 88 | 
            -
                  def create_multipart(env, params, boundary = nil)
         | 
| 89 | 
            -
                    stream = super
         | 
| 90 | 
            -
                    stream.read
         | 
| 91 | 
            -
                  end
         | 
| 92 | 
            -
                end
         | 
| 93 | 
            -
              end
         | 
| 94 | 
            -
            end
         | 
| @@ -1,40 +0,0 @@ | |
| 1 | 
            -
             | 
| 2 | 
            -
             | 
| 3 | 
            -
            # For later, if we want to parse user agents:
         | 
| 4 | 
            -
            #   http://code.google.com/p/browserscope/source/browse/trunk/models/user_agent.py
         | 
| 5 | 
            -
            #   http://www.useragentstring.com/pages/All/
         | 
| 6 | 
            -
            #   http://github.com/jaxn/parse-user-agent
         | 
| 7 | 
            -
            #   http://code.google.com/p/browserscope/wiki/UserAgentParsing
         | 
| 8 | 
            -
            #   http://code.google.com/p/ua-parser/source/browse/
         | 
| 9 | 
            -
            #   http://github.com/shenoudab/active_device/tree/master/lib/active_device/
         | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
            #
         | 
| 13 | 
            -
            # * Mozilla based
         | 
| 14 | 
            -
            # * Mozilla version
         | 
| 15 | 
            -
            # * X11 based
         | 
| 16 | 
            -
            # * Security
         | 
| 17 | 
            -
            # * OS
         | 
| 18 | 
            -
            # * CPU family
         | 
| 19 | 
            -
            # * Language Tag
         | 
| 20 | 
            -
            # * Renderer (i.e. Webkit, Trident, Presto)
         | 
| 21 | 
            -
            # * Renderer Version
         | 
| 22 | 
            -
            # * I don't see a utility for the "KHTML" and "like Gecko" bits, but whatever.
         | 
| 23 | 
            -
            # * Based on
         | 
| 24 | 
            -
            # * Browser Build (not really sure about this either)
         | 
| 25 | 
            -
             | 
| 26 | 
            -
            # * Browser Family (i.e. Firefox, IE, Chrome, etc..)
         | 
| 27 | 
            -
            # * Project Name (optional, i.e. Namoroka, Shiretoko)
         | 
| 28 | 
            -
            # * Major Version
         | 
| 29 | 
            -
            # * Minor Version
         | 
| 30 | 
            -
            # * Version Third Bit
         | 
| 31 | 
            -
            # * Version Fourth Bit
         | 
| 32 | 
            -
            # * Open Question: How should we handle the "alpha/beta" bit, like apre1? I'm inclined to say we put it in its own datapoint and let people group together how ever they want, but not leave it attached to any of the version bits.
         | 
| 33 | 
            -
             | 
| 34 | 
            -
            # Bot
         | 
| 35 | 
            -
            # Brand
         | 
| 36 | 
            -
            # Browser
         | 
| 37 | 
            -
            # Engine
         | 
| 38 | 
            -
            # Handset
         | 
| 39 | 
            -
            # Model
         | 
| 40 | 
            -
            # OS
         | 
| @@ -1,82 +0,0 @@ | |
| 1 | 
            -
            #!/usr/bin/env ruby
         | 
| 2 | 
            -
            require 'rubygems'
         | 
| 3 | 
            -
            require 'wukong/script'
         | 
| 4 | 
            -
             | 
| 5 | 
            -
            module WordCount
         | 
| 6 | 
            -
              class Mapper < Wukong::Streamer::LineStreamer
         | 
| 7 | 
            -
                #
         | 
| 8 | 
            -
                # Emit each word in each line.
         | 
| 9 | 
            -
                #
         | 
| 10 | 
            -
                def process line
         | 
| 11 | 
            -
                  tokenize(line).each{|word| yield [word, 1] }
         | 
| 12 | 
            -
                end
         | 
| 13 | 
            -
             | 
| 14 | 
            -
                #
         | 
| 15 | 
            -
                # Split a string into its constituent words.
         | 
| 16 | 
            -
                #
         | 
| 17 | 
            -
                # This is pretty simpleminded:
         | 
| 18 | 
            -
                # * downcase the word
         | 
| 19 | 
            -
                # * Split at any non-alphanumeric boundary, including '_'
         | 
| 20 | 
            -
                # * However, preserve the special cases of 's, 'd or 't at the end of a
         | 
| 21 | 
            -
                #   word.
         | 
| 22 | 
            -
                #
         | 
| 23 | 
            -
                #   tokenize("Ability is a poor man's wealth #johnwoodenquote")
         | 
| 24 | 
            -
                #   # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
         | 
| 25 | 
            -
                #
         | 
| 26 | 
            -
                def tokenize str
         | 
| 27 | 
            -
                  return [] if str.blank?
         | 
| 28 | 
            -
                  str = str.downcase;
         | 
| 29 | 
            -
                  # kill off all punctuation except [stuff]'s or [stuff]'t
         | 
| 30 | 
            -
                  # this includes hyphens (words are split)
         | 
| 31 | 
            -
                  str = str.
         | 
| 32 | 
            -
                    gsub(/[^a-zA-Z0-9\']+/, ' ').
         | 
| 33 | 
            -
                    gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
         | 
| 34 | 
            -
                  # Busticate at whitespace
         | 
| 35 | 
            -
                  words = str.split(/\s+/)
         | 
| 36 | 
            -
                  words.reject!{|w| w.blank? }
         | 
| 37 | 
            -
                  words
         | 
| 38 | 
            -
                end
         | 
| 39 | 
            -
              end
         | 
| 40 | 
            -
             | 
| 41 | 
            -
              #
         | 
| 42 | 
            -
              # A bit kinder to your memory manager: accumulate the sum record-by-record:
         | 
| 43 | 
            -
              #
         | 
| 44 | 
            -
              class Reducer2 < Wukong::Streamer::AccumulatingReducer
         | 
| 45 | 
            -
                
         | 
| 46 | 
            -
                def start!(*args)
         | 
| 47 | 
            -
                  @key_count =  0
         | 
| 48 | 
            -
                end
         | 
| 49 | 
            -
                
         | 
| 50 | 
            -
                def accumulate(*args)
         | 
| 51 | 
            -
                  @key_count += 1
         | 
| 52 | 
            -
                end
         | 
| 53 | 
            -
                
         | 
| 54 | 
            -
                def finalize
         | 
| 55 | 
            -
                  yield [ key, @key_count ]
         | 
| 56 | 
            -
                end
         | 
| 57 | 
            -
              end
         | 
| 58 | 
            -
             | 
| 59 | 
            -
              #
         | 
| 60 | 
            -
              # You can stack up all the values in a list then sum them at once.
         | 
| 61 | 
            -
              #
         | 
| 62 | 
            -
              # This isn't good style, as it means the whole list is held in memory
         | 
| 63 | 
            -
              #
         | 
| 64 | 
            -
              class Reducer1 < Wukong::Streamer::ListReducer
         | 
| 65 | 
            -
                def finalize
         | 
| 66 | 
            -
                  yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
         | 
| 67 | 
            -
                end
         | 
| 68 | 
            -
              end
         | 
| 69 | 
            -
             | 
| 70 | 
            -
              #
         | 
| 71 | 
            -
              # ... easiest of all, though: this is common enough that it's already included
         | 
| 72 | 
            -
              #
         | 
| 73 | 
            -
              require 'wukong/streamer/count_keys'
         | 
| 74 | 
            -
              class Reducer3 < Wukong::Streamer::CountKeys
         | 
| 75 | 
            -
              end
         | 
| 76 | 
            -
            end
         | 
| 77 | 
            -
             | 
| 78 | 
            -
            # Execute the script
         | 
| 79 | 
            -
            Wukong.run(
         | 
| 80 | 
            -
              WordCount::Mapper,
         | 
| 81 | 
            -
              WordCount::Reducer2
         | 
| 82 | 
            -
              )
         | 
    
        data/old/examples/size.rb
    DELETED
    
    | @@ -1,61 +0,0 @@ | |
| 1 | 
            -
            #!/usr/bin/env ruby
         | 
| 2 | 
            -
            require 'rubygems'
         | 
| 3 | 
            -
            require 'wukong/script'
         | 
| 4 | 
            -
             | 
| 5 | 
            -
            module Size
         | 
| 6 | 
            -
              #
         | 
| 7 | 
            -
              # Feed the entire dataset through wc and sum the results
         | 
| 8 | 
            -
              #
         | 
| 9 | 
            -
              class Script < Wukong::Script
         | 
| 10 | 
            -
                #
         | 
| 11 | 
            -
                # Don't implement a wukong script to do something if there's a unix command
         | 
| 12 | 
            -
                # that does it faster: just override map_command or reduce_command in your
         | 
| 13 | 
            -
                # subclass of Wukong::Script to return the complete command line
         | 
| 14 | 
            -
                #
         | 
| 15 | 
            -
                def map_command
         | 
| 16 | 
            -
                  '/usr/bin/wc'
         | 
| 17 | 
            -
                end
         | 
| 18 | 
            -
             | 
| 19 | 
            -
                # Make all records go to one reducer
         | 
| 20 | 
            -
                def default_options
         | 
| 21 | 
            -
                  super.merge :reduce_tasks => 1
         | 
| 22 | 
            -
                end
         | 
| 23 | 
            -
              end
         | 
| 24 | 
            -
             | 
| 25 | 
            -
              #
         | 
| 26 | 
            -
              # Sums the numeric value of each column in its input
         | 
| 27 | 
            -
              #
         | 
| 28 | 
            -
              class Reducer < Wukong::Streamer::Base
         | 
| 29 | 
            -
                attr_accessor :sums
         | 
| 30 | 
            -
             | 
| 31 | 
            -
                #
         | 
| 32 | 
            -
                # The unix +wc+ command uses whitespace, not tabs, so we'll recordize
         | 
| 33 | 
            -
                # accordingly.
         | 
| 34 | 
            -
                #
         | 
| 35 | 
            -
                def recordize line
         | 
| 36 | 
            -
                  line.strip.split(/\s+/)
         | 
| 37 | 
            -
                end
         | 
| 38 | 
            -
             | 
| 39 | 
            -
                #
         | 
| 40 | 
            -
                # add each corresponding column in the input
         | 
| 41 | 
            -
                #
         | 
| 42 | 
            -
                def process *vals
         | 
| 43 | 
            -
                  self.sums = vals.zip( sums || [] ).map{|val,sum| val.to_i + sum.to_i }
         | 
| 44 | 
            -
                end
         | 
| 45 | 
            -
             | 
| 46 | 
            -
                #
         | 
| 47 | 
            -
                # run through the whole reduction input and then output the total
         | 
| 48 | 
            -
                #
         | 
| 49 | 
            -
                def stream *args
         | 
| 50 | 
            -
                  super *args
         | 
| 51 | 
            -
                  emit sums
         | 
| 52 | 
            -
                end
         | 
| 53 | 
            -
              end
         | 
| 54 | 
            -
            end
         | 
| 55 | 
            -
             | 
| 56 | 
            -
            # Execute the script
         | 
| 57 | 
            -
            Size::Script.new(
         | 
| 58 | 
            -
              nil,
         | 
| 59 | 
            -
              Size::Reducer,
         | 
| 60 | 
            -
              :reduce_tasks => 1
         | 
| 61 | 
            -
              ).run
         | 
| @@ -1,86 +0,0 @@ | |
| 1 | 
            -
            #!/usr/bin/env ruby
         | 
| 2 | 
            -
            # run like so:
         | 
| 3 | 
            -
            # $> ruby average_value_frequecy.rb --run=local data/stats.tsv data/avf_out.tsv
         | 
| 4 | 
            -
            require 'rubygems'
         | 
| 5 | 
            -
            require 'wukong'
         | 
| 6 | 
            -
             | 
| 7 | 
            -
            #
         | 
| 8 | 
            -
            # Calculate the average value frequency (AVF) for each data row. AVF for a data
         | 
| 9 | 
            -
            # point with m attributes is defined as:
         | 
| 10 | 
            -
            #
         | 
| 11 | 
            -
            #     avf = (1/m)* sum (frequencies of attributes 1..m)
         | 
| 12 | 
            -
            #
         | 
| 13 | 
            -
            # so with the data
         | 
| 14 | 
            -
            #
         | 
| 15 | 
            -
            #      1       15      30      25
         | 
| 16 | 
            -
            #      2       10      10      20
         | 
| 17 | 
            -
            #      3       50      30      30
         | 
| 18 | 
            -
            #
         | 
| 19 | 
            -
            # for the first row, avf = (1/3)*(1+2+1) ~= 1.33. An outlier is identified by
         | 
| 20 | 
            -
            # a low AVF.
         | 
| 21 | 
            -
            #
         | 
| 22 | 
            -
            module AverageValueFrequency
         | 
| 23 | 
            -
              # Names for each column's attribute, in order
         | 
| 24 | 
            -
              ATTR_NAMES = %w[length width height]
         | 
| 25 | 
            -
             | 
| 26 | 
            -
              class HistogramMapper < Wukong::Streamer::RecordStreamer
         | 
| 27 | 
            -
                # unroll each row from
         | 
| 28 | 
            -
                #     [id,   val1,   val2, ....]
         | 
| 29 | 
            -
                # into
         | 
| 30 | 
            -
                #     [attr1,   val1]
         | 
| 31 | 
            -
                #     [attr2,   val2]
         | 
| 32 | 
            -
                #     ...
         | 
| 33 | 
            -
                def process id, *values
         | 
| 34 | 
            -
                  ATTR_NAMES.zip(values).each do |attr, val|
         | 
| 35 | 
            -
                    yield [attr, val]
         | 
| 36 | 
            -
                  end
         | 
| 37 | 
            -
                end
         | 
| 38 | 
            -
              end
         | 
| 39 | 
            -
             | 
| 40 | 
            -
              #
         | 
| 41 | 
            -
              # Build a histogram of values
         | 
| 42 | 
            -
              #
         | 
| 43 | 
            -
              class HistogramReducer < Wukong::Streamer::CountingReducer
         | 
| 44 | 
            -
                # use the attr and val as the key
         | 
| 45 | 
            -
                def get_key attr, val=nil, *_
         | 
| 46 | 
            -
                  [attr, val]
         | 
| 47 | 
            -
                end
         | 
| 48 | 
            -
              end
         | 
| 49 | 
            -
             | 
| 50 | 
            -
              class AvfRecordMapper < Wukong::Streamer::RecordStreamer
         | 
| 51 | 
            -
                # average the frequency of each value
         | 
| 52 | 
            -
                def process id, *values
         | 
| 53 | 
            -
                  sum = 0.0
         | 
| 54 | 
            -
                  ATTR_NAMES.zip(values).each do |attr, val|
         | 
| 55 | 
            -
                    sum += histogram[ [attr, val] ].to_i
         | 
| 56 | 
            -
                  end
         | 
| 57 | 
            -
                  avf = sum / ATTR_NAMES.length.to_f
         | 
| 58 | 
            -
                  yield [id, avf, *values]
         | 
| 59 | 
            -
                end
         | 
| 60 | 
            -
             | 
| 61 | 
            -
                # Load the histogram from a tab-separated file with
         | 
| 62 | 
            -
                #   attr    val   freq
         | 
| 63 | 
            -
                def histogram
         | 
| 64 | 
            -
                  return @histogram if @histogram
         | 
| 65 | 
            -
                  @histogram = { }
         | 
| 66 | 
            -
                  File.open(options[:histogram_file]).each do |line|
         | 
| 67 | 
            -
                    attr, val, freq = line.chomp.split("\t")
         | 
| 68 | 
            -
                    @histogram[ [attr, val] ] = freq
         | 
| 69 | 
            -
                  end
         | 
| 70 | 
            -
                  @histogram
         | 
| 71 | 
            -
                end
         | 
| 72 | 
            -
              end
         | 
| 73 | 
            -
            end
         | 
| 74 | 
            -
             | 
| 75 | 
            -
            Settings.use :commandline, :define
         | 
| 76 | 
            -
            Settings.define :histogram,      :description => "Run the first pass to calculate a histogram"
         | 
| 77 | 
            -
            Settings.define :avf,            :description => "Run the second pass, to run back over the records with the histogram and find the AVF for each row."
         | 
| 78 | 
            -
            Settings.define :histogram_file, :description => "File to load the histogram from (supply name of the  output file from first pass)"
         | 
| 79 | 
            -
            Settings.resolve!
         | 
| 80 | 
            -
            if Settings[:histogram]
         | 
| 81 | 
            -
              Wukong::Script.new(AverageValueFrequency::HistogramMapper, AverageValueFrequency::HistogramReducer).run
         | 
| 82 | 
            -
            elsif Settings[:avf]
         | 
| 83 | 
            -
              Wukong::Script.new(AverageValueFrequency::AvfRecordMapper, nil).run
         | 
| 84 | 
            -
            else
         | 
| 85 | 
            -
              raise "Please specify either --histogram (for first round) or --avf (second round)"
         | 
| 86 | 
            -
            end
         |