wukong 3.0.0.pre → 3.0.0.pre2
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
data/examples/munging/wikipedia/pageviews/augment_pageviews.pig
@@ -0,0 +1,32 @@
+/*
+ * Augments raw pageview data with page ID.
+ * Pageview stats are *theoretically* uniquely keyed by namespace
+ * and title, so that is what is used to join pageviews with page_metadata.
+ *
+ * In practice, the original pageview stats only give the URL visited, and
+ * reliably extracting namespace and title from the URL is difficult. Additionally,
+ * page names change, redirects happen, and many other small things can go
+ * wrong with the join. All pageview data is kept in the final table, but
+ * the page id will be blank in rows where the join failed.
+ *
+ * Output format:
+ * page_id:int, namespace:int, title:chararray, num_visitors:long,
+ * date:int, time:int, epoch_time:long, day_of_week:int
+ */
+
+%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- page metadata for all Wikipedia pages
+%default EXTRACTED_PAGEVIEWS '/data/scratch/wikipedia/full/pageviews' -- raw extracted pageview stats (see extract_pageviews.rb)
+%default AUGMENTED_PAGEVIEWS_OUT '/data/results/wikipedia/full/pageviews' -- where output will be stored
+
+page_metadata = LOAD '$PAGE_METADATA' AS
+    (id:int, namespace:int, title:chararray,
+    restrictions:chararray, counter:long, is_redirect:int, is_new:int,
+    random:float, touched:int, page_latest:int, len:int);
+pageviews = LOAD '$EXTRACTED_PAGEVIEWS' AS (namespace:int, title:chararray,
+    num_visitors:long, date:int, time:int, epoch_time:long, day_of_week:int);
+
+first_join = JOIN page_metadata BY (namespace, title) RIGHT OUTER, pageviews BY (namespace, title);
+final = FOREACH first_join GENERATE
+    page_metadata::id, pageviews::namespace, pageviews::title, pageviews::num_visitors,
+    pageviews::date, pageviews::time, pageviews::epoch_time, pageviews::day_of_week;
+STORE final INTO '$AUGMENTED_PAGEVIEWS_OUT';
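As the header comment above notes, the RIGHT OUTER join keeps every pageview row even when no page metadata matched, leaving the page id blank (null). A minimal downstream sketch of how a consumer of this output might split matched from unmatched rows, assuming the default output path and the documented output format; the alias names here are illustrative only:

    -- Sketch only: reads the output of augment_pageviews.pig above
    augmented = LOAD '/data/results/wikipedia/full/pageviews' AS
        (page_id:int, namespace:int, title:chararray, num_visitors:long,
         date:int, time:int, epoch_time:long, day_of_week:int);
    matched   = FILTER augmented BY page_id IS NOT NULL; -- join found page metadata
    unmatched = FILTER augmented BY page_id IS NULL;     -- page id blank: join failed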
data/examples/munging/wikipedia/pageviews/extract_pageviews.rb
@@ -0,0 +1,85 @@
+#!/usr/bin/env ruby
+# encoding:UTF-8
+
+# Pig output format:
+# namespace:int, title:chararray, num_visitors:long, date:int, time:int, epoch_time:long, day_of_week:int
+
+$:.unshift '/home/dlaw/dev/wukong_og/lib'
+$:.unshift '/home/dlaw/dev/gorillib/lib'
+
+require 'uri'
+require 'pathname'
+require 'json'
+require 'wukong'
+require 'wukong/streamer'
+require 'wukong/streamer/encoding_cleaner'
+load '/home/dlaw/dev/wukong/examples/munging/wikipedia/utils/munging_utils.rb'
+
+ENV['map_input_file'] ||= 'pagecounts-20071222-100000.gz'
+
+class String
+  def is_enwiki?
+    return (not (self =~ /^en /).nil?)
+  end
+
+  def is_after_enwiki?
+    return (not (self =~ /^(e[o-z][a-z]*|[f-z][a-z]+) /).nil?)
+  end
+end
+
+module PageviewsExtractor
+  class Mapper < Wukong::Streamer::LineStreamer
+    include Wukong::Streamer::EncodingCleaner
+    include MungingUtils
+
+    ns_json_file = File.open("/home/dlaw/dev/wukong/examples/munging/wikipedia/utils/namespaces.json",'r:UTF-8')
+    NAMESPACES = JSON.parse(ns_json_file.read)
+
+    # the filename strings are formatted as
+    # pagecounts-YYYYMMDD-HH0000.gz
+    def time_from_filename(filename)
+      parts = filename.split('-')
+      year = parts[1][0..3].to_i
+      month = parts[1][4..5].to_i
+      day = parts[1][6..7].to_i
+      hour = parts[2][0..1].to_i
+      return Time.new(year,month,day,hour)
+    end
+
+    def process line
+      # we only want enwiki lines
+      return if @done
+      if line.is_after_enwiki?
+        @done = true
+        return
+      end
+      return if not line.is_enwiki?
+      # we have an enwiki line - process it!
+      fields = line.split(' ')[1..-1]
+      out_fields = []
+      # add the namespace
+      namespace = nil
+      if fields[0].include? ':'
+        namespace = NAMESPACES[fields[0].split(':')[0]]
+        out_fields << (namespace || '0')
+      else
+        out_fields << '0'
+      end
+      # add the title
+      if namespace.nil?
+        out_fields << URI.unescape(fields[0])
+      else
+        out_fields << URI.unescape(fields[0][(fields[0].index(':')||-1)+1..-1])
+      end
+      # add number of visitors in the hour
+      out_fields << fields[2]
+      # grab date info from filename
+      file = Pathname.new(ENV['map_input_file']).basename
+      time = time_from_filename(file.to_s)
+      out_fields += time_columns_from_time(time)
+      yield out_fields
+    end
+  end
+end
+
+Wukong::Script.new(PageviewsExtractor::Mapper, Wukong::Streamer::LineStreamer).run
data/examples/munging/wikipedia/pig_style_guide.md
@@ -0,0 +1,25 @@
+# Pig Style Guide
+
+- Everything except names should be in all caps. E.g.
+
+        first_join = JOIN pages BY (namespace,title)
+          RIGHT OUTER, pageviews BY (namespace, title);
+
+- Group and align columns in the script in ways that make sense. Don't be afraid of newlines. E.g.
+
+        second_pass = FOREACH second_pass_j GENERATE
+          first_pass::from_id, pages::id,
+          first_pass::from_namespace, first_pass::from_title,
+          first_pass::into_namespace, first_pass::into_title;
+
+- Columns that form an important sub-set of the table's data should be easily accessible as a unit.
+
+  E.g. The edge list above has the from and into ids in the first and second columns, making it easy to just get an edge list of ids without the additional metadata.
+
+- When at all possible, you should include sample LOAD statements in the comments for your script. This makes it easy to use the output of your script
+
+- Parameterize as much as possible. All paths should be parameterized.
+
+- Parameters should be in all caps, e.g. $NODE.
+
+- Parameters should have defaults if at all possible. When you define the default, also include a comment describing the parameter.
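A minimal sketch of the last three bullets taken together (parameterized path, all-caps parameter name, default with a describing comment), mirroring how the sub_*.pig scripts added in this release declare their node-list parameter:

    %default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
    sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);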
data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig
@@ -0,0 +1,19 @@
+/*
+ * Filters the page metadata table, leaving only pages that
+ * are redirects.
+ *
+ * Output Format (same as page_metadata):
+ * (id:int, namespace:int, title:chararray, restrictions:chararray,
+ * counter:long, is_redirect:int, is_new:int, random:float, touched:int,
+ * page_latest:int, len:int)
+ */
+
+%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- page metdata for all pages in Wikipedia
+%default REDIRECTS_OUT '/data/results/wikipedia/full/redirect_page_metadata' -- place to store page metdata for redirects
+
+page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray,
+    restrictions:chararray, counter:long, is_redirect:int, is_new:int, random:float,
+    touched:int, page_latest:int, len:int);
+
+redirects = FILTER page_metadata BY (is_redirect == 1);
+STORE redirects INTO '$REDIRECTS_OUT';
data/examples/munging/wikipedia/subuniverse/sub_articles.pig
@@ -0,0 +1,23 @@
+/*
+ * This script filters the articles table, leaving only the articles
+ * in the specified subuniverse.
+ *
+ * Output format:
+ * page_id:int, title:chararray, namespace:int, rev_date:int, rev_time:int,
+ * rev_epoch_time:long, rev_dow:int, article_text:chararray
+ */
+
+%default ARTICLES '/data/results/wikipedia/full/articles' -- all articles in the wikipedia corpus
+%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
+%default SUB_ARTICLES_OUT '/data/results/wikipedia/mini/articles' -- where output will be stored
+
+articles = LOAD '$ARTICLES' AS (page_id:int, title:chararray, namespace:int,
+    rev_date:int, rev_time:int, rev_epoch_time:long, rev_dow:int, article_text:chararray);
+sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
+sub_articles_unfiltered = JOIN articles BY id, sub_nodes BY node_id;
+sub_articles = FOREACH sub_articles_unfiltered GENERATE
+    articles::page_id AS page_id, articles::title AS title, articles::namespace AS namespace,
+    articles::rev_date AS rev_date, articles::rev_time AS rev_time,
+    articles::rev_epoch_time AS rev_epoch_time, articles::rev_dow AS rev_dow,
+    articles::article_text AS article_text;
+STORE sub_articles INTO '$SUB_ARTICLES_OUT';
data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig
@@ -0,0 +1,24 @@
+/*
+ * This script filters the page metadata table, leaving only the pages
+ * in the specified subuniverse.
+ *
+ * Output format (same as page_metadata):
+ * id:int, namespace:int, title:chararray, restrictions:chararray, counter:long,
+ * is_redirect:int, is_new:int, random:float, touched:int, page_latest:int, len:int
+ */
+
+%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- metadata for all pages in the wikipedia corpus
+%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
+%default SUB_PAGE_METADATA_OUT '/data/results/wikipedia/mini/page_metadata' -- where output will be stored
+
+page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray,
+    restrictions:chararray, counter:long, is_redirect:int, is_new:int, random:float,
+    touched:int, page_latest:int, len:int);
+sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
+sub_page_metadata_unfiltered = JOIN page_metadata BY id, sub_nodes BY node_id;
+sub_page_metadata = FOREACH sub_page_metadata_unfiltered GENERATE
+    page_metadata::id, page_metadata::namespace, page_metadata::title,
+    page_metadata::restrictions, page_metadata::counter, page_metadata::is_redirect,
+    page_metadata::is_new, page_metadata::random, page_metadata::touched,
+    page_metadata::page_latest, page_metadata::len;
+STORE sub_page_metadata INTO '$SUB_PAGE_METADATA_OUT';
data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig
@@ -0,0 +1,22 @@
+/*
+ * This script filters the pagelinks table, leaving only the pagelinks
+ * that start within supplied subuniverse.
+ *
+ * Output format (same as augmented_pagelinks):
+ * from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
+ */
+
+%default PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all edges in the pagelink graph (must be *directed*)
+%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
+%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
+
+all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
+    from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
+sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
+
+sub_pagelinks_from = JOIN all_pagelinks BY from_id, sub_nodes BY node_id;
+sub_pagelinks = FOREACH sub_pagelinks_from GENERATE
+    all_pagelinks::from_id, all_pagelinks::into_id,
+    all_pagelinks::from_namespace, all_pagelinks::from_title,
+    all_pagelinks::into_namespace, all_pagelinks::into_title;
+STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig
@@ -0,0 +1,22 @@
+/*
+ * This script filters the pagelinks table, leaving only the pagelinks
+ * that terminate within supplied subuniverse.
+ *
+ * Output format (same as augment_pagelinks):
+ * node_a:int, node_b:int, a_into_b:int, b_into_a:int, is_symmetric:int
+ */
+
+%default PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all edges in the pagelink graph (must be *directed*)
+%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
+%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
+
+all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
+    from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
+sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
+
+sub_pagelinks_into = JOIN all_pagelinks BY into_id, sub_nodes BY node_id;
+sub_pagelinks = FOREACH sub_pagelinks_into GENERATE
+    all_pagelinks::from_id, all_pagelinks::into_id,
+    all_pagelinks::from_namespace, all_pagelinks::from_title,
+    all_pagelinks::into_namespace, all_pagelinks::into_title;
+STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig
@@ -0,0 +1,26 @@
+/*
+ * This script filters the pagelinks table, leaving only the pagelinks
+ * that start and end within supplied subuniverse.
+ *
+ * Output format (same as augment_pagelinks):
+ * from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
+ */
+
+%default PAGELINKS '/data/results/wikipedia/full/undirected_pagelinks' -- all edges in the pagelink graph
+%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
+%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
+
+all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
+    from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
+sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
+
+sub_pagelinks_in = JOIN all_pagelinks BY from_id, sub_nodes BY node_id;
+sub_pagelinks_unfiltered = JOIN sub_pagelinks_in BY into_id, sub_nodes BY node_id;
+sub_pagelinks = FOREACH sub_pagelinks_unfiltered GENERATE
+    sub_pagelinks_in::all_pagelinks::from_id,
+    sub_pagelinks_in::all_pagelinks::into_id,
+    sub_pagelinks_in::all_pagelinks::from_namespace,
+    sub_pagelinks_in::all_pagelinks::from_title,
+    sub_pagelinks_in::all_pagelinks::into_namespace,
+    sub_pagelinks_in::all_pagelinks::into_title;
+STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig
@@ -0,0 +1,29 @@
+/*
+ * This script filters the pageviews table, leaving only the pageviews
+ * in the specified subuniverse.
+ *
+ * Parameters:
+ * pageviews - all pageviews in the wikipedia corpus
+ * sub_nodes - the list of nodes in your subuniverse
+ * sub_pageviews_out - the directory where output will be stored
+ *
+ * Output format (same as pageviews_augment.pig):
+ * id:int, namespace:int,
+ * page_id:int, title:chararray, namespace:int, rev_date:int, rev_time:int,
+ * rev_epoch_time:long, rev_dow:int, article_text:chararray
+ */
+
+%default PAGEVIEWS '/data/results/wikipedia/full/pageviews' -- all pageview stats for the English Wikipedia
+%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
+%default SUB_PAGEVIEWS_OUT '/data/results/wikipedia/mini/pageviews' -- where output will be stored
+
+pageviews = LOAD '$PAGEVIEWS' AS (page_id:int, title:chararray, namespace:int,
+    rev_date:int, rev_time:int, rev_epoch_time:long, rev_dow:int, article_text:chararray);
+sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
+sub_pageviews_unfiltered = JOIN pageviews BY id, sub_nodes BY node_id;
+sub_pageviews = FOREACH sub_pageviews_unfiltered GENERATE
+    articles::page_id AS page_id, articles::title AS title, articles::namespace AS namespace,
+    articles::rev_date AS rev_date, articles::rev_time AS rev_time,
+    articles::rev_epoch_time AS rev_epoch_time, articles::rev_dow AS rev_dow,
+    articles::article_text AS article_text;
+STORE sub_pageviews INTO '$SUB_PAGEVIEWS_OUT';
data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig
@@ -0,0 +1,24 @@
+/*
+ * This script filters the pagelinks table, leaving only the pagelinks
+ * that start and end within supplied subuniverse.
+ *
+ * Output format (same as undirected_pagelinks):
+ * node_a:int, node_b:int, a_into_b:int, b_into_a:int, is_symmetric:int
+ */
+
+%default UNDIRECTED_PAGELINKS '/data/results/wikipedia/full/undirected_pagelinks' -- all edges in the pagelink graph
+%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
+%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored
+
+all_pagelinks = LOAD '$UNDIRECTED_PAGELINKS' AS (node_a:int, node_b:int, a_into_b:int, b_into_a:int, is_symmetric:int);
+sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
+
+sub_pagelinks_in = JOIN all_pagelinks BY node_a, sub_nodes BY node_id;
+sub_pagelinks_unfiltered = JOIN sub_pagelinks_in BY node_b, sub_nodes BY node_id;
+sub_pagelinks = FOREACH sub_pagelinks_unfiltered GENERATE
+    sub_pagelinks_in::all_pagelinks::node_a AS node_a,
+    sub_pagelinks_in::all_pagelinks::node_b AS node_b,
+    sub_pagelinks_in::all_pagelinks::a_into_b AS a_into_b,
+    sub_pagelinks_in::all_pagelinks::b_into_a AS b_into_a,
+    sub_pagelinks_in::all_pagelinks::is_symmetric AS is_symmetric;
+STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
data/examples/munging/wikipedia/utils/get_namespaces.rb
@@ -0,0 +1,86 @@
+#!/usr/bin/env ruby
+# encoding:UTF-8
+
+# A script that fetches the namespace -> id mapping for
+# all wikipedia languages. The output is stored (by default)
+# in a json file that represents a hash from namespace name => id
+
+require 'ruby-progressbar'
+require 'open-uri'
+require 'set'
+require 'configliere'
+require 'json'
+
+Settings.use :commandline
+
+NS_FILE = 'namespaces'
+
+Settings.define :out_dir, flag: 'o', description: "Directory to drop the namespace file into.", default: File.expand_path(File.dirname(__FILE__))
+Settings.define :verbose, flag: 'v', description: "Get chatty", type: :boolean, default: false
+Settings.define :silent, description: "Say nothing", type: :boolean, default: false
+Settings.define :head_length, flag: 'h', description: "The number of lines to read into the wiki xml for the namespace definitions.", type: Integer, default: 100
+Settings.define :std_out, flag: 's', description: "Print output to standard out.", type: :boolean, default: false
+Settings.define :to_tsv, flag: 't', description: 'Format the output as a TSV instead of JSON', type: :boolean, default:false
+
+Settings.resolve!
+
+Settings.out_dir = File.expand_path(Settings.out_dir)
+
+namespaces = {}
+namespaces_by_wiki = {}
+
+wikis_page = open('http://dumps.wikimedia.org/backup-index.html')
+wikis = Set.new
+
+# grap the list of wikis
+wikis_page.each_line do |line|
+  next unless line =~ />[a-z]*wiki</
+  wikis << line.gsub(/.*>([a-z]*)wiki<.*/,'\1')[0..-2]
+end
+
+if Settings.verbose
+  $stderr.puts "Retrieved the names of #{wikis.size} wikis"
+  $stderr.puts "Grabbing namespace data"
+elsif (not Settings.silent)
+  progressbar = ProgressBar.create(:title => "Retrieving Namespaces...", :total => wikis.size, :format => '%t |%B| %c/%C %e ')
+end
+
+wikis.each_with_index do |prefix,index|
+  progressbar.increment unless (Settings.silent or Settings.verbose)
+  namespaces_by_wiki[prefix] = {}
+  $stderr.puts "Getting namespaces for #{prefix}.wikipedia.org" if Settings.verbose
+  raw = `curl -s 'http://dumps.wikimedia.org/#{prefix}wiki/latest/#{prefix}wiki-latest-pages-logging.xml.gz' | gzcat | head -n #{Settings.head_length}`
+  #TODO: Make this actually work
+  if $?.exitstatus != 0
+    out = "Could not access page dump for #{prefix}wiki." +
+          " This dump is probably being updated now." +
+          " Namespaces for this wiki will not be included in the final output"
+    $stderr.puts out
+    next
+  end
+  raw.each_line do |line|
+    next unless line =~ /.*<\/?namespace[^>]*>/
+    match = /<\/?namespace key="(?<key>-?\d+)"[^>]*>(?<ns>[^<]*)<\/namespace>/.match(line)
+    next if match.nil?
+    namespaces[match[:ns]] = match[:key].to_i
+    namespaces_by_wiki[prefix][match[:ns]] = match[:key].to_i
+    $stderr.puts " #{match[:ns]} -> #{match[:key]}" if Settings.verbose
+  end
+  $stderr.puts "Finished getting namespaces for #{prefix}.wikipedia.org. #{wikis.size - index} wikis to go" if Settings.verbose
+end
+
+if Settings.to_tsv
+  output = ""
+  namespaces.each_pair do |k,v|
+    output += "#{k}\t#{v}\n"
+  end
+else
+  output = namespaces.to_json
+end
+
+if Settings.std_out
+  pp output
+else
+  filename = "#{Settings.out_dir}/#{NS_FILE}.#{Settings.to_tsv ? "tsv" : "json"}"
+  File.open(filename, 'w') { |f| f.write(output)}
+end
data/examples/munging/wikipedia/utils/munging_utils.rb
@@ -0,0 +1,68 @@
+# encoding:UTF-8
+
+require 'multi_json'
+
+module MungingUtils
+  extend self # you can call MungingUtils.foo, or include it and call on self.
+
+  # all non-keyboard characters (that is, characters outside the 0x20 to 0x127 range)
+  NON_PLAIN_ASCII_RE = /[^\x20-\x7e]/m
+  # characters below 0x20
+  CONTROL_CHARS_RE = /[\x00-\x19]/m
+
+  def time_columns_from_time(time)
+    columns = []
+    columns << "%04d%02d%02d" % [time.year, time.month, time.day]
+    columns << "%02d%02d%02d" % [time.hour, time.min, time.sec]
+    columns << time.to_i
+    columns << time.wday
+    return columns
+  end
+
+  def warn_record(desc, record=nil)
+    record_info = MultiJson.encode(record)[0..1000] rescue "(unencodeable record) #{record.inspect[0..100]}"
+    Log.warn [desc, record_info].join("\t")
+    nil
+  end
+
+  # Modifies the text in place, replacing all newlines, tabs, and other control
+  # characters with a space (those < ascii 0x20, but not including 0xff). This
+  # uses a whitelist
+  #
+  # Only use this if funny characters aren't suppose to be in there in the first
+  # place; there are safe, easy ways to properly encode, eg `MultiJson.encode()`
+  #
+  def scrub_control_chars(text)
+    text.gsub!(CONTROL_CHARS_RE, ' ')
+    text
+  end
+
+  # Modifies the text in place, replacing all non-keyboard characters (newline,
+  # tab, anything not between ascii 0x20 and 0x7e) with their XML entity encoding
+  def safe_xml_encode(text)
+    text.gsub!(NON_PLAIN_ASCII_RE){|ch| "\\u%04x" % ch.ord } unless jsonized.ascii_only?
+    text
+  end
+
+
+  # Returns a JSON encoded string, with all non-ASCII characters escaped
+  def safe_json_encode(string)
+    jsonized = MultiJson.encode(string)
+    jsonized.gsub!(NON_PLAIN_ASCII_RE){|ch| "\\u%04x" % ch.ord } unless jsonized.ascii_only?
+    jsonized
+  end
+
+
+end
+
+Time.class_eval do
+  def to_flat
+    utc.strftime("%Y%m%d%H%M%SZ")
+  end
+end
+
+MatchData.class_eval do
+  def as_hash
+    Hash[ names.map{|name| [name.to_sym, self[name]] } ]
+  end
+end