wukong 3.0.0.pre → 3.0.0.pre2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,33 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
Settings.define :sampling_fraction, :type => Float, :required => true, :description => "floating-point number between 0 and 1 giving the fraction of lines to emit: at sampling_fraction=1 all records are emitted, at 0 none are."
|
6
|
-
|
7
|
-
#
|
8
|
-
# Probabilistically emit some fraction of record/lines
|
9
|
-
#
|
10
|
-
# Set the sampling fraction at the command line using the
|
11
|
-
# --sampling_fraction=
|
12
|
-
# option: for example, to take a random 1/1000th of the lines in huge_files,
|
13
|
-
# ./examples/sample_records.rb --sampling_fraction=0.001 --run huge_files sampled_files
|
14
|
-
#
|
15
|
-
class Mapper < Wukong::Streamer::LineStreamer
|
16
|
-
include Wukong::Streamer::Filter
|
17
|
-
|
18
|
-
#
|
19
|
-
# randomly decide to emit +sampling_fraction+ fraction of lines
|
20
|
-
#
|
21
|
-
def emit? line
|
22
|
-
rand < Settings.sampling_fraction
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
#
|
27
|
-
# Executes the script
|
28
|
-
#
|
29
|
-
Wukong.run( Mapper,
|
30
|
-
nil,
|
31
|
-
:reduce_tasks => 0,
|
32
|
-
:reuse_jvms => true
|
33
|
-
)
|
@@ -1,15 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby -E ASCII-8BIT
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
$: << File.dirname(__FILE__)
|
5
|
-
require 'logline'
|
6
|
-
|
7
|
-
class ApacheLogParser < Wukong::Streamer::LineStreamer
|
8
|
-
|
9
|
-
# create a Logline object from each record and serialize it flat to disk
|
10
|
-
def process line
|
11
|
-
yield Logline.parse(line)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
Wukong.run( ApacheLogParser, nil, :sort_fields => 7 ) if $0 == __FILE__
|
@@ -1,48 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby -E BINARY
|
2
|
-
require 'rubygems'
|
3
|
-
require 'faraday'
|
4
|
-
require 'wukong/script'
|
5
|
-
require 'json'
|
6
|
-
$: << File.dirname(__FILE__)
|
7
|
-
require 'apache_log_parser'
|
8
|
-
require 'nook/faraday_dummy_adapter'
|
9
|
-
|
10
|
-
Settings.define :target_host, :default => 'localhost', :description => "The host name or IP address to target"
|
11
|
-
Settings.define :target_scheme, :default => 'http', :description => "Request scheme (http, https)"
|
12
|
-
|
13
|
-
#
|
14
|
-
# A Nook consumes its input stream and, for each input, generates an HTTP
|
15
|
-
# request against a remote host. Please use it for good and never for evil.
|
16
|
-
#
|
17
|
-
# You can use it from your command line:
|
18
|
-
# zcat /var/www/app/current/log/*access*.log.gz | ./nook.rb --map --host=http://my_own_host.com
|
19
|
-
#
|
20
|
-
#
|
21
|
-
class NookMapper < ApacheLogParser
|
22
|
-
# create a Logline object from each record and serialize it flat to disk
|
23
|
-
def process line
|
24
|
-
super(line) do |logline|
|
25
|
-
start = Time.now
|
26
|
-
resp = fetcher.get(logline.path, :user_agent => logline.ua, :referer => logline.referer)
|
27
|
-
yield [Time.now.to_flat, (Time.now - start).to_f, resp.status, resp.body.size, logline.path, resp.body]
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
def track record
|
32
|
-
monitor.periodically do |m|
|
33
|
-
m.progress
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
# a mock fetcher with a uniformly distributed variable delay
|
38
|
-
def fetcher
|
39
|
-
@fetcher ||= Faraday::Connection.new(:url => 'http://localhost:80/') do |f|
|
40
|
-
f.use Faraday::Adapter::Dummy do |dummy|
|
41
|
-
dummy.delay = Proc.new{|env| 0.05 } # 0.2 * rand()
|
42
|
-
# dummy.body = Proc.new{|env| env[:url] }
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
Wukong.run( NookMapper, nil, :sort_fields => 7 )
|
@@ -1,94 +0,0 @@
|
|
1
|
-
|
2
|
-
module Faraday
|
3
|
-
class Adapter
|
4
|
-
|
5
|
-
# test = Faraday::Connection.new do |f|
|
6
|
-
# f.use Faraday::Adapter::Dummy do |dummy|
|
7
|
-
# dummy.status 404
|
8
|
-
# dummy.delay 1
|
9
|
-
# end
|
10
|
-
# end
|
11
|
-
#
|
12
|
-
# # this will delay 0.2s, returning 404 with
|
13
|
-
# resp = text.get("/your/mom", :dummy_delay => 0.2)
|
14
|
-
# resp.body # => {"method":"get","url":"/your/mom","request_headers":{"Dummy-Delay":"0.2","dummy_delay":0.2},"request":{"proxy":null},"ssl":{}}
|
15
|
-
#
|
16
|
-
# More example:
|
17
|
-
#
|
18
|
-
# test = Faraday::Connection.new do |f|
|
19
|
-
# f.use Faraday::Adapter::Dummy, :status => 503
|
20
|
-
# end
|
21
|
-
#
|
22
|
-
# test = Faraday::Connection.new do |f|
|
23
|
-
# f.use Faraday::Adapter::Dummy do |dummy|
|
24
|
-
# dummy.delay = Proc.new{|env| 0.1 + 0.8 * rand() }
|
25
|
-
# end
|
26
|
-
# end
|
27
|
-
#
|
28
|
-
class Dummy < Middleware
|
29
|
-
include Addressable
|
30
|
-
attr_reader :config
|
31
|
-
def self.loaded?() false end
|
32
|
-
|
33
|
-
# gets value from environment if set, configured instance variable otherwise
|
34
|
-
def value_for env, key
|
35
|
-
val = env[:request_headers]["Dummy-#{header_hash_key(key)}"] || config[key]
|
36
|
-
if val.respond_to?(:call)
|
37
|
-
val = val.call(env)
|
38
|
-
end
|
39
|
-
val
|
40
|
-
end
|
41
|
-
|
42
|
-
# With an optional delay, constructs a [status, headers, response] based on the first of:
|
43
|
-
# * request header field (Dummy-Status, Dummy-Headers, Dummy-Resonse)
|
44
|
-
# * adapter's configuration:
|
45
|
-
# * Unless one of the above is set, body will return a json string taken from the request hash
|
46
|
-
#
|
47
|
-
def call(env)
|
48
|
-
status = value_for(env, :status)
|
49
|
-
headers = value_for(env, :headers)
|
50
|
-
headers = JSON.load(headers) if headers.is_a? String
|
51
|
-
body = value_for(env, :body) ||
|
52
|
-
env.dup.tap{|hsh| [:response, :parallel_manager, :body].each{|k| hsh.delete k} }.to_json
|
53
|
-
delay = value_for(env, :delay).to_f
|
54
|
-
sleep delay if delay > 0
|
55
|
-
headers[:dummy_delay] = delay
|
56
|
-
env.update(
|
57
|
-
:status => status,
|
58
|
-
:response_headers => headers,
|
59
|
-
:body => body)
|
60
|
-
@app.call(env)
|
61
|
-
end
|
62
|
-
|
63
|
-
class Configurator < Struct.new(:status, :headers, :delay, :body)
|
64
|
-
def status(val=nil) self.status = val if val ; super() end
|
65
|
-
def headers(val=nil) self.headers = val if val ; super() end
|
66
|
-
def body(val=nil) self.body = val if val ; super() end
|
67
|
-
def delay(val=nil) self.delay = val if val ; super() end
|
68
|
-
def self.from_hash hsh
|
69
|
-
new().tap{|config| hsh.each{|k,v| config.send("#{k}=", v) } }
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
def initialize(app, defaults={}, &block)
|
74
|
-
super(app)
|
75
|
-
@config = Configurator.from_hash(defaults.reverse_merge(:status => 200, :delay => 0, :headers => {}))
|
76
|
-
configure(&block) if block
|
77
|
-
end
|
78
|
-
|
79
|
-
def configure
|
80
|
-
yield config
|
81
|
-
end
|
82
|
-
|
83
|
-
# same as in Faraday::Utils -- turns :dummy_response_status into 'Dummy-Response-Status'
|
84
|
-
def header_hash_key(str)
|
85
|
-
str.to_s.split('_').each{|w| w.capitalize! }.join('-')
|
86
|
-
end
|
87
|
-
|
88
|
-
def create_multipart(env, params, boundary = nil)
|
89
|
-
stream = super
|
90
|
-
stream.read
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
# For later, if we want to parse user agents:
|
4
|
-
# http://code.google.com/p/browserscope/source/browse/trunk/models/user_agent.py
|
5
|
-
# http://www.useragentstring.com/pages/All/
|
6
|
-
# http://github.com/jaxn/parse-user-agent
|
7
|
-
# http://code.google.com/p/browserscope/wiki/UserAgentParsing
|
8
|
-
# http://code.google.com/p/ua-parser/source/browse/
|
9
|
-
# http://github.com/shenoudab/active_device/tree/master/lib/active_device/
|
10
|
-
|
11
|
-
|
12
|
-
#
|
13
|
-
# * Mozilla based
|
14
|
-
# * Mozilla version
|
15
|
-
# * X11 based
|
16
|
-
# * Security
|
17
|
-
# * OS
|
18
|
-
# * CPU family
|
19
|
-
# * Language Tag
|
20
|
-
# * Renderer (i.e. Webkit, Trident, Presto)
|
21
|
-
# * Renderer Version
|
22
|
-
# * I don't see a utility for the "KHTML" and "like Gecko" bits, but whatever.
|
23
|
-
# * Based on
|
24
|
-
# * Browser Build (not really sure about this either)
|
25
|
-
|
26
|
-
# * Browser Family (i.e. Firefox, IE, Chrome, etc..)
|
27
|
-
# * Project Name (optional, i.e. Namoroka, Shiretoko)
|
28
|
-
# * Major Version
|
29
|
-
# * Minor Version
|
30
|
-
# * Version Third Bit
|
31
|
-
# * Version Fourth Bit
|
32
|
-
# * Open Question: How should we handle the "alpha/beta" bit, like apre1? I'm inclined to say we put it in its own datapoint and let people group together how ever they want, but not leave it attached to any of the version bits.
|
33
|
-
|
34
|
-
# Bot
|
35
|
-
# Brand
|
36
|
-
# Browser
|
37
|
-
# Engine
|
38
|
-
# Handset
|
39
|
-
# Model
|
40
|
-
# OS
|
@@ -1,82 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
module WordCount
|
6
|
-
class Mapper < Wukong::Streamer::LineStreamer
|
7
|
-
#
|
8
|
-
# Emit each word in each line.
|
9
|
-
#
|
10
|
-
def process line
|
11
|
-
tokenize(line).each{|word| yield [word, 1] }
|
12
|
-
end
|
13
|
-
|
14
|
-
#
|
15
|
-
# Split a string into its constituent words.
|
16
|
-
#
|
17
|
-
# This is pretty simpleminded:
|
18
|
-
# * downcase the word
|
19
|
-
# * Split at any non-alphanumeric boundary, including '_'
|
20
|
-
# * However, preserve the special cases of 's, 'd or 't at the end of a
|
21
|
-
# word.
|
22
|
-
#
|
23
|
-
# tokenize("Ability is a poor man's wealth #johnwoodenquote")
|
24
|
-
# # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
|
25
|
-
#
|
26
|
-
def tokenize str
|
27
|
-
return [] if str.blank?
|
28
|
-
str = str.downcase;
|
29
|
-
# kill off all punctuation except [stuff]'s or [stuff]'t
|
30
|
-
# this includes hyphens (words are split)
|
31
|
-
str = str.
|
32
|
-
gsub(/[^a-zA-Z0-9\']+/, ' ').
|
33
|
-
gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
|
34
|
-
# Busticate at whitespace
|
35
|
-
words = str.split(/\s+/)
|
36
|
-
words.reject!{|w| w.blank? }
|
37
|
-
words
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
#
|
42
|
-
# A bit kinder to your memory manager: accumulate the sum record-by-record:
|
43
|
-
#
|
44
|
-
class Reducer2 < Wukong::Streamer::AccumulatingReducer
|
45
|
-
|
46
|
-
def start!(*args)
|
47
|
-
@key_count = 0
|
48
|
-
end
|
49
|
-
|
50
|
-
def accumulate(*args)
|
51
|
-
@key_count += 1
|
52
|
-
end
|
53
|
-
|
54
|
-
def finalize
|
55
|
-
yield [ key, @key_count ]
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
#
|
60
|
-
# You can stack up all the values in a list then sum them at once.
|
61
|
-
#
|
62
|
-
# This isn't good style, as it means the whole list is held in memory
|
63
|
-
#
|
64
|
-
class Reducer1 < Wukong::Streamer::ListReducer
|
65
|
-
def finalize
|
66
|
-
yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
#
|
71
|
-
# ... easiest of all, though: this is common enough that it's already included
|
72
|
-
#
|
73
|
-
require 'wukong/streamer/count_keys'
|
74
|
-
class Reducer3 < Wukong::Streamer::CountKeys
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
# Execute the script
|
79
|
-
Wukong.run(
|
80
|
-
WordCount::Mapper,
|
81
|
-
WordCount::Reducer2
|
82
|
-
)
|
data/old/examples/size.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
|
-
module Size
|
6
|
-
#
|
7
|
-
# Feed the entire dataset through wc and sum the results
|
8
|
-
#
|
9
|
-
class Script < Wukong::Script
|
10
|
-
#
|
11
|
-
# Don't implement a wukong script to do something if there's a unix command
|
12
|
-
# that does it faster: just override map_command or reduce_command in your
|
13
|
-
# subclass of Wukong::Script to return the complete command line
|
14
|
-
#
|
15
|
-
def map_command
|
16
|
-
'/usr/bin/wc'
|
17
|
-
end
|
18
|
-
|
19
|
-
# Make all records go to one reducer
|
20
|
-
def default_options
|
21
|
-
super.merge :reduce_tasks => 1
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
#
|
26
|
-
# Sums the numeric value of each column in its input
|
27
|
-
#
|
28
|
-
class Reducer < Wukong::Streamer::Base
|
29
|
-
attr_accessor :sums
|
30
|
-
|
31
|
-
#
|
32
|
-
# The unix +wc+ command uses whitespace, not tabs, so we'll recordize
|
33
|
-
# accordingly.
|
34
|
-
#
|
35
|
-
def recordize line
|
36
|
-
line.strip.split(/\s+/)
|
37
|
-
end
|
38
|
-
|
39
|
-
#
|
40
|
-
# add each corresponding column in the input
|
41
|
-
#
|
42
|
-
def process *vals
|
43
|
-
self.sums = vals.zip( sums || [] ).map{|val,sum| val.to_i + sum.to_i }
|
44
|
-
end
|
45
|
-
|
46
|
-
#
|
47
|
-
# run through the whole reduction input and then output the total
|
48
|
-
#
|
49
|
-
def stream *args
|
50
|
-
super *args
|
51
|
-
emit sums
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
# Execute the script
|
57
|
-
Size::Script.new(
|
58
|
-
nil,
|
59
|
-
Size::Reducer,
|
60
|
-
:reduce_tasks => 1
|
61
|
-
).run
|
@@ -1,86 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# run like so:
|
3
|
-
# $> ruby average_value_frequecy.rb --run=local data/stats.tsv data/avf_out.tsv
|
4
|
-
require 'rubygems'
|
5
|
-
require 'wukong'
|
6
|
-
|
7
|
-
#
|
8
|
-
# Calculate the average value frequency (AVF) for each data row. AVF for a data
|
9
|
-
# point with m attributes is defined as:
|
10
|
-
#
|
11
|
-
# avf = (1/m)* sum (frequencies of attributes 1..m)
|
12
|
-
#
|
13
|
-
# so with the data
|
14
|
-
#
|
15
|
-
# 1 15 30 25
|
16
|
-
# 2 10 10 20
|
17
|
-
# 3 50 30 30
|
18
|
-
#
|
19
|
-
# for the first row, avf = (1/3)*(1+2+1) ~= 1.33. An outlier is identified by
|
20
|
-
# a low AVF.
|
21
|
-
#
|
22
|
-
module AverageValueFrequency
|
23
|
-
# Names for each column's attribute, in order
|
24
|
-
ATTR_NAMES = %w[length width height]
|
25
|
-
|
26
|
-
class HistogramMapper < Wukong::Streamer::RecordStreamer
|
27
|
-
# unroll each row from
|
28
|
-
# [id, val1, val2, ....]
|
29
|
-
# into
|
30
|
-
# [attr1, val1]
|
31
|
-
# [attr2, val2]
|
32
|
-
# ...
|
33
|
-
def process id, *values
|
34
|
-
ATTR_NAMES.zip(values).each do |attr, val|
|
35
|
-
yield [attr, val]
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
#
|
41
|
-
# Build a histogram of values
|
42
|
-
#
|
43
|
-
class HistogramReducer < Wukong::Streamer::CountingReducer
|
44
|
-
# use the attr and val as the key
|
45
|
-
def get_key attr, val=nil, *_
|
46
|
-
[attr, val]
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
class AvfRecordMapper < Wukong::Streamer::RecordStreamer
|
51
|
-
# average the frequency of each value
|
52
|
-
def process id, *values
|
53
|
-
sum = 0.0
|
54
|
-
ATTR_NAMES.zip(values).each do |attr, val|
|
55
|
-
sum += histogram[ [attr, val] ].to_i
|
56
|
-
end
|
57
|
-
avf = sum / ATTR_NAMES.length.to_f
|
58
|
-
yield [id, avf, *values]
|
59
|
-
end
|
60
|
-
|
61
|
-
# Load the histogram from a tab-separated file with
|
62
|
-
# attr val freq
|
63
|
-
def histogram
|
64
|
-
return @histogram if @histogram
|
65
|
-
@histogram = { }
|
66
|
-
File.open(options[:histogram_file]).each do |line|
|
67
|
-
attr, val, freq = line.chomp.split("\t")
|
68
|
-
@histogram[ [attr, val] ] = freq
|
69
|
-
end
|
70
|
-
@histogram
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
Settings.use :commandline, :define
|
76
|
-
Settings.define :histogram, :description => "Run the first pass to calculate a histogram"
|
77
|
-
Settings.define :avf, :description => "Run the second pass, to run back over the records with the histogram and find the AVF for each row."
|
78
|
-
Settings.define :histogram_file, :description => "File to load the histogram from (supply name of the output file from first pass)"
|
79
|
-
Settings.resolve!
|
80
|
-
if Settings[:histogram]
|
81
|
-
Wukong::Script.new(AverageValueFrequency::HistogramMapper, AverageValueFrequency::HistogramReducer).run
|
82
|
-
elsif Settings[:avf]
|
83
|
-
Wukong::Script.new(AverageValueFrequency::AvfRecordMapper, nil).run
|
84
|
-
else
|
85
|
-
raise "Please specify either --histogram (for first round) or --avf (second round)"
|
86
|
-
end
|