wukong 3.0.0.pre → 3.0.0.pre2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,90 +0,0 @@
|
|
1
|
-
job_201006200508_0002 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b 100.00%
|
2
|
-
s3 => hdfs bz2 parser, cond_em empty (?)
|
3
|
-
201006200508_0002 35mins, 34sec 1 1812031232 0 12495736645 7240978546 8180472 388863907 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
|
4
|
-
|
5
|
-
job_201006200508_0003 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes 100.00%
|
6
|
-
s3 => hdfs bz2 parser, cond_em duplicate
|
7
|
-
201006200508_0003 15mins, 50sec 1 1812031232 0 11877866580 7240978546 8180472 383928615 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
|
8
|
-
|
9
|
-
job_201006200508_0004 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2 100.00%
|
10
|
-
hdfs => hdfs bz2 parser, cond_em empty
|
11
|
-
201006200508_0004 36mins, 56sec 1 1812031232 13334645497 7240978546 8180472 395564272 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
|
12
|
-
|
13
|
-
job_201006200508_0005 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em 100.00%
|
14
|
-
hdfs => hdfs bz2 parser, no_cond_em --
|
15
|
-
201006200508_0005 35mins, 23sec 1 1812031232 13479823318 7240978546 8180472 396757046 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
|
16
|
-
|
17
|
-
job_201006200508_0006 NORMAL flip hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111 100.00%
|
18
|
-
hdfs => hdfs bz2 `which cat`
|
19
|
-
201006200508_0006 1mins, 10sec 1 1812031232 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
|
20
|
-
|
21
|
-
job_201006200508_0007 NORMAL flip hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n 100.00%
|
22
|
-
s3 => hdfs bz2 `which cat`
|
23
|
-
201006200508_0007 1mins, 55sec 1 1812031232 0 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
|
24
|
-
|
25
|
-
job_201006200508_0008 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db 100.00%
|
26
|
-
hdfs => hdfs flat parser no cond_em no db
|
27
|
-
201006200508_0008 10mins, 59sec 1 7240978549 13545881166 7240978549 8180472 397172723 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
|
28
|
-
|
29
|
-
job_201006200508_0015 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db 100.00%
|
30
|
-
hdfs => hdfs flat parser cond_em on users only no DB
|
31
|
-
201006200508_0015 23mins, 48sec 1 7240978549 13415414554 7240978549 8180472 396101235 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
|
32
|
-
|
33
|
-
job_201006200508_0016 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-nodupes 100.00%
|
34
|
-
hdfs => hdfs flat parser cond_em on users only - vanished saving id/sn to DB
|
35
|
-
201006200508_0016 28mins, 7sec 1 0 7240978549 13414285504 7240978549 8180472 396091251 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
|
36
|
-
|
37
|
-
job_201006200508_0017 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes 100.00%
|
38
|
-
hdfs => hdfs flat parser cond_em on users only - duped saving id/sn to DB
|
39
|
-
201006200508_0017 11mins, 51sec 1 0 7240978549 12221205449 7240978549 8180472 386114331 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
|
40
|
-
|
41
|
-
===========================================================================
|
42
|
-
== Parse
|
43
|
-
==
|
44
|
-
|
45
|
-
job_201006200508_0018 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056 100.00%
|
46
|
-
201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 141729936525 128606199040 14198839 3918844056 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
|
47
|
-
|
48
|
-
for foo in 0016 0017 0018 ; do echo $foo ; ~/ics/hadoop/chimpmark/bin/elephantscat.rb job_201006200508_$foo ; done
|
49
|
-
cat ~/timings/job/201006200508/*/*.tsv | wu-lign
|
50
|
-
|
51
|
-
job_id scraped_at run_time succ? s3n_in hdfs_in file_in hdfs_out file_out map_in map_out map_recs_in map_recs_out red_recs_in red_recs_out job_name
|
52
|
-
201006200508_0002 35mins, 34sec 1 1812031232 0 0 12495736645 0 7240978546 0 8180472 388863907 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
|
53
|
-
201006200508_0003 15mins, 50sec 1 1812031232 0 0 11877866580 0 7240978546 0 8180472 383928615 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
|
54
|
-
201006200508_0004 36mins, 56sec 1 1812031232 0 13334645497 0 7240978546 0 8180472 395564272 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
|
55
|
-
201006200508_0005 35mins, 23sec 1 1812031232 0 13479823318 0 7240978546 0 8180472 396757046 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
|
56
|
-
201006200508_0006 1mins, 10sec 1 1812031232 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
|
57
|
-
201006200508_0007 1mins, 55sec 1 1812031232 0 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
|
58
|
-
201006200508_0008 10mins, 59sec 1 7240978549 0 13545881166 0 7240978549 0 8180472 397172723 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
|
59
|
-
201006200508_0015 23mins, 48sec 1 7240978549 0 13415414554 0 7240978549 0 8180472 396101235 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
|
60
|
-
201006200508_0016 28mins, 7sec 1 7240978549 0 13414285504 0 7240978549 0 8180472 396091251 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
|
61
|
-
201006200508_0017 11mins, 51sec 1 7240978549 0 12221205449 0 7240978549 0 8180472 386114331 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
|
62
|
-
201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 0 0 141729936525 0 128606199040 0 14198839 3918844056 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
|
63
|
-
201006200508_0021 8hrs, 50mins, 52sec 1 141779023755 62208536220 24722859867 73825391771 141729936525 189098533358 3918844056 3918844056 155139258 155139258 Unsplicer
|
64
|
-
201006200508_0029 1mins, 20sec 1 1763173995 0 1762322014 0 1762322014 0 22764940 22764940 0 0 hdp-stream-flat-/bin/cat-/data/sn/tw/rawd/unspliced/twitter_user-/tmp/foo
|
65
|
-
201006200508_0031 3hrs, 48mins, 6sec 1 14930014182 0 0 48106164389 0 113092707367 0 8408164 753481311 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/201004---/data/sn/tw/rawd/parsed/api/201004
|
66
|
-
201006200508_0034 30mins, 46sec 1 7170990599 2203578261 8389754083 5031160348 7170990599 7170990510 143461243 143461241 143461241 67443309 bulk_load_conversation.rb---/data/sn/tw/fixd/objects/a_replies_b---/data/sn/tw/fixd/apeyeye/conversation/a_replies_b_json
|
67
|
-
|
68
|
-
Identity mapper Wukong `which cat` pig
|
69
|
-
Identity reducer wukong `which cat` pig
|
70
|
-
* no skew
|
71
|
-
* data/reducer > ram
|
72
|
-
|
73
|
-
Do a sort|uniq on 150GB
|
74
|
-
|
75
|
-
|
76
|
-
* 1.8 GB bz2, S3 => HDFS 1m55s
|
77
|
-
* 1.8 GB bz2, HDFS => HDFS 1m10s
|
78
|
-
|
79
|
-
TokyoTyrant, 1 node => 4 m1.large (Balancer) 15_000 inserts/sec
|
80
|
-
TokyoTyrant, 20 tasks => 4 m1.large (Balancer) 2_000 inserts/sec
|
81
|
-
|
82
|
-
===========================================================================
|
83
|
-
|
84
|
-
Parse:
|
85
|
-
|
86
|
-
hdp-du s3n://monkeyshines.infochimps.org/data/ripd/com.tw/\*/ > /mnt/tmp/ripd_com.tw-du.tsv
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
1050 entries 448483502374 417.7 GB
|
@@ -1,65 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: mrflip.github.com/wukong - wu-lign utility
|
4
|
-
collapse: false
|
5
|
-
---
|
6
|
-
|
7
|
-
h1. wu-lign -- format a tab-separated file as aligned columns
|
8
|
-
|
9
|
-
wu-lign will intelligently reformat a tab-separated file into a tab-separated, space-aligned file that is still suitable for further processing. For example, given the log-file input
|
10
|
-
|
11
|
-
<pre><code>
|
12
|
-
2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
|
13
|
-
2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
|
14
|
-
2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
|
15
|
-
2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
|
16
|
-
2009-07-21T21:44:29 world 65536 1.09110 32850 200916
|
17
|
-
2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
|
18
|
-
2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
|
19
|
-
</code></pre>
|
20
|
-
|
21
|
-
wu-lign will reformat it to read
|
22
|
-
|
23
|
-
<pre><code>
|
24
|
-
2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
|
25
|
-
2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
|
26
|
-
2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
|
27
|
-
2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
|
28
|
-
2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
|
29
|
-
2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
|
30
|
-
2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
|
31
|
-
</code></pre>
|
32
|
-
|
33
|
-
The fields are still tab-delimited by exactly one tab -- only spaces are used to pad out fields. You can still use cuttab and friends to manipulate columns.
|
34
|
-
|
35
|
-
wu-lign isn't intended to be smart, or correct, or reliable -- only to be useful for previewing and organizing tab-formatted files. In general @wu-lign(foo).split("\t").map(&:strip)@ *should* give output semantically equivalent to its input. (That is, the only changes should be insertion of spaces and re-formatting of numerics.) But still -- reserve its use for human inspection only.
|
36
|
-
|
37
|
-
(Note: tab characters in this source code file have been converted to spaces; replace whitespace with tab in the first example if you'd like to play along at home.)
|
38
|
-
|
39
|
-
h2. How it works
|
40
|
-
|
41
|
-
Wu-Lign takes the first 500ish lines, splits into fields on TAB characters, and tries to guess the format (int, float, or string) for each. It builds a consensus of the width and type for corresponding columns in the chunk. If a column has mixed numeric and string formats it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements all of them are formatted as float.
|
42
|
-
|
43
|
-
h2. Command-line arguments
|
44
|
-
|
45
|
-
You can give sprintf-style positional arguments on the command line that will be applied to the corresponding columns. (Blank args are used for placeholding and auto-formatting is still applied). So with the example above,
|
46
|
-
|
47
|
-
@cat foo | wu-lign '' '' '' '%8.4e'@
|
48
|
-
|
49
|
-
will format the fourth column with "%8.4e", while the first three columns and fifth-and-higher columns are formatted as usual.
|
50
|
-
|
51
|
-
<pre><code>
|
52
|
-
...
|
53
|
-
2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
|
54
|
-
2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
|
55
|
-
2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
|
56
|
-
....
|
57
|
-
</code></pre>
|
58
|
-
|
59
|
-
h2. Notes
|
60
|
-
|
61
|
-
* Header rows: the first line is used for width alignment but not for type detection. This means that an initial row of text headers will inform column spacing but still allow a column of floats (say) to be properly aligned as floats.
|
62
|
-
* It requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
|
63
|
-
* It won't set columns wider than 100 chars -- this allows for the occasional super-wide column without completely breaking your screen.
|
64
|
-
* For :float values, wulign tries to guess at the right number of significant digits to the left and right of the decimal point.
|
65
|
-
* wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab delimits a field, every newline a record.
|
@@ -1,17 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: mrflip.github.com/wukong - Using Wukong and Wuclan, Part 1 - Setup
|
4
|
-
collapse: false
|
5
|
-
---
|
6
|
-
|
7
|
-
h1. Using Wukong and Wuclan, Part 0 - Setup
|
8
|
-
|
9
|
-
Please follow the "installation and setup directions":setup.html for wukong, hadoop and a compute cluster.
|
10
|
-
|
11
|
-
h1. Using Wukong and Wuclan, Part 1 - Scraping
|
12
|
-
|
13
|
-
This part needs writing.
|
14
|
-
|
15
|
-
Later, it will tell you how to get a large corpus of data to use in part 2.
|
16
|
-
|
17
|
-
In the meantime check out http://mrflip.github.com/monkeyshines/ and http://mrflip.github.com/wuclan/ -- in particular the "Twitter Search Scraper":http://github.com/mrflip/wuclan/tree/master/examples/twitter/scrape_twitter_search/ example. We use this in production to gather and analyze tens of gigabytes of twitter conversations.
|
@@ -1,75 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: mrflip.github.com/wukong - Overview
|
4
|
-
collapse: false
|
5
|
-
---
|
6
|
-
|
7
|
-
h1. Thinking Big Data
|
8
|
-
|
9
|
-
h2. There's lots of data, Wukong and Hadoop can help
|
10
|
-
|
11
|
-
|
12
|
-
There are two disruptive changes underway:
|
13
|
-
|
14
|
-
* We're instrumenting every realm of human activity
|
15
|
-
** Conversation
|
16
|
-
** Relationships
|
17
|
-
**
|
18
|
-
|
19
|
-
* We have linearly scaling multiprocessing
|
20
|
-
** Old frontier computing: expensive, N log N, SUUUUUUCKS
|
21
|
-
** It's cheap, it's scaleable and it's fun
|
22
|
-
|
23
|
-
h2. == Map|Reduce ==
|
24
|
-
|
25
|
-
h3. cat input.tsv | mapper.sh | sort | reducer.sh > output.tsv
|
26
|
-
|
27
|
-
* Bobo histogram:
|
28
|
-
|
29
|
-
cat twitter_users.tsv | cuttab 3 | cutc 1-6 | sort | uniq -c > histogram.tsv
|
30
|
-
|
31
|
-
cat twitter_users.tsv | \
|
32
|
-
cuttab 3 | # extract the date column \
|
33
|
-
cutc 1-6 | # chop off all but the yearmonth \
|
34
|
-
sort | # sort, to ensure locality \
|
35
|
-
uniq -c > # roll up lines, along with their count \
|
36
|
-
histogram.tsv # save into output file
|
37
|
-
|
38
|
-
|
39
|
-
h3. Word Count
|
40
|
-
|
41
|
-
mapper:
|
42
|
-
|
43
|
-
# output each word on its own line
|
44
|
-
@readlines.each{|line| puts line.split(/[^\w]+/) }@
|
45
|
-
|
46
|
-
reducer:
|
47
|
-
|
48
|
-
# every word is _guaranteed_ to land in the same place and next to its
|
49
|
-
# friends, so we can just output the repetition count for each
|
50
|
-
# distinct line.
|
51
|
-
uniq -c
|
52
|
-
|
53
|
-
|
54
|
-
h3. Word Count by Person
|
55
|
-
|
56
|
-
* Partition Keys vs. Reduce Keys
|
57
|
-
|
58
|
-
- reduce by [word, <total>, count] and [word, user_id, count]
|
59
|
-
|
60
|
-
|
61
|
-
h2. == Global Structure ==
|
62
|
-
|
63
|
-
h3. Enumerating neighborhood
|
64
|
-
|
65
|
-
* adjacency list
|
66
|
-
|
67
|
-
* join on center link
|
68
|
-
|
69
|
-
* list of 3-paths ==
|
70
|
-
|
71
|
-
h2. == Mechanics, HDFS ==
|
72
|
-
|
73
|
-
|
74
|
-
x M _
|
75
|
-
_ M y
|
@@ -1,138 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: mrflip.github.com/wukong - Using Wukong and Wuclan, Part 3 - Parsing
|
4
|
-
collapse: false
|
5
|
-
---
|
6
|
-
|
7
|
-
h1. Using Wukong and Wuclan - Parsing
|
8
|
-
|
9
|
-
In part 1 we began a scraper to trawl our desired part of the social web. Now
|
10
|
-
we're ready to start using Wukong to process the files.
|
11
|
-
|
12
|
-
Files come off the wire as
|
13
|
-
|
14
|
-
:url :scraped_at :response_code :response_message :contents
|
15
|
-
String DateTime (flat) Integer String String (JSON-formatted, tab&newline-munged)
|
16
|
-
|
17
|
-
The contents field is a JSON-formatted mix of records:
|
18
|
-
|
19
|
-
* TwitterFollowersRequest and TwitterFriendsRequest yield an @Array[Hash{user => raw_tweet}]@. We want to extract a stream of AFollowsB (with the request user as user_a for a friends request and user_b for a followers request) along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
|
20
|
-
* TwitterFavoritesRequest yields an @Array[Hash{tweet_hash => user_hash}]@. We want to extract a stream of AFavoritesB along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records
|
21
|
-
* TwitterUser yields a single @user_hash@ making one each of TwitterUser, TwitterUserProfile and TwitterUserStyle.
|
22
|
-
* UserTimelineRequest and PublicTimelineRequest yield an Array[Hash{tweet => user}]. We want to extract the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
|
23
|
-
* TwitterFollowerIdsRequest and TwitterFriendIdsRequest return an Array[user_ids] (each user_id is a simple Integer). We extract a series of AFollowsB (using the request's user_id as user_a_id or user_b_id)
|
24
|
-
|
25
|
-
We want to split each API response into a stream of those TwitterUser, Tweet, etc. records.
|
26
|
-
|
27
|
-
# Stream in each line (each line holds one request)
|
28
|
-
# turn the line into the corresponding TwitterRequest
|
29
|
-
# have the TwitterRequest parse its JSON contents and construct the TwitterUser, Tweet, etc.
|
30
|
-
# serialize those records back out as tab-separated lines suitable for further processing with Wukong
|
31
|
-
|
32
|
-
h4. The basics of StructStreamer
|
33
|
-
|
34
|
-
Wukong handles the first and last steps through its StructStreamer and the standard .to_flat method. So the actual structure is really simple:
|
35
|
-
|
36
|
-
#
|
37
|
-
# Instantiate each incoming request.
|
38
|
-
# Stream out the contained classes it generates.
|
39
|
-
#
|
40
|
-
class TwitterRequestParser < Wukong::Streamer::StructStreamer
|
41
|
-
def process request
|
42
|
-
request.parse do |obj|
|
43
|
-
yield obj
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
# This makes the script go.
|
49
|
-
Wukong::Script.new(TwitterRequestParser, nil).run
|
50
|
-
|
51
|
-
In practice, all you need to know is that a StructStreamer gets a stream of objects to parse. Here's an outline of its internals. The Wukong StructStreamer:
|
52
|
-
|
53
|
-
# takes each flattened line:
|
54
|
-
|
55
|
-
"twitter_friends_request http://.... 20090701123456 ...fields... [{...}, {...}, ...json..., {...}]"
|
56
|
-
|
57
|
-
# splits by tabs to create an array of fields
|
58
|
-
|
59
|
-
["twitter_friends_request", "http://...", ... "[{...}, {...}, ...json..., {...}]"]
|
60
|
-
|
61
|
-
# constructs the class name indicated in the first field,
|
62
|
-
using the values extracted from the remaining fields.
|
63
|
-
|
64
|
-
TwitterFriendsRequest.new "http://...", "20090701123456", ... "[{...}, {...}, ...json..., {...}]"
|
65
|
-
|
66
|
-
The last (contents) field is still just a string: there's nothing special about it to Wukong.
|
67
|
-
|
68
|
-
h4. Parsing
|
69
|
-
|
70
|
-
Since each request's contents are handled in a slightly (and brittle-ly) different manner, we just ask each request object to parse itself and feed out all the TwitterXXXX objects it generates.
|
71
|
-
|
72
|
-
class TwitterFollowersRequest
|
73
|
-
# ...
|
74
|
-
|
75
|
-
def parse &block
|
76
|
-
return unless healthy?
|
77
|
-
# for each raw user/tweet pair in the parsed JSON contents,
|
78
|
-
parsed_contents.each do |hsh|
|
79
|
-
json_obj = JsonUserWithTweet.new(hsh, 'scraped_at' => scraped_at)
|
80
|
-
next unless json_obj && json_obj.healthy?
|
81
|
-
# Extract user, tweet and relationship
|
82
|
-
yield AFollowsB.new(json_obj.user.id, self.twitter_user_id) if json_obj.user
|
83
|
-
json_obj.each(&block)
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
# ...
|
88
|
-
end
|
89
|
-
|
90
|
-
The TwitterXXXRequest objects consist of one or many hashes with (a raw user hash, and possibly its latest raw tweet hash) or (a raw tweet hash and its raw user hash). The user hash might have only the fields for a TwitterPartialUser or it might have the fields for a full set of TwitterUser, TwitterUserProfile, TwitterUserStyle. Besides which, the fields themselves need some massaging to be compatible with Wukong and other tools in our Map/Reduce toolkit (details explained in a later section).
|
91
|
-
|
92
|
-
The fiddly little details are handled by a JsonUserWithTweet or JsonTweetWithUser (as appropriate) adapter pattern:
|
93
|
-
|
94
|
-
class JsonUserTweetPair
|
95
|
-
def initialize raw, moreinfo
|
96
|
-
# clean up fields in entries (flatten date, true/false -> 1/0, etc)
|
97
|
-
fix_raw_user!
|
98
|
-
fix_raw_tweet!
|
99
|
-
end
|
100
|
-
|
101
|
-
# generate all the contained TwitterXXX objects
|
102
|
-
def each
|
103
|
-
#
|
104
|
-
end
|
105
|
-
|
106
|
-
# create TwitterUser object from raw info
|
107
|
-
def user
|
108
|
-
end
|
109
|
-
# create Tweet object from raw tweet hash
|
110
|
-
def tweet
|
111
|
-
end
|
112
|
-
# ... and so forth
|
113
|
-
end
|
114
|
-
|
115
|
-
I'll ignore the gory details; view the source if you're interested.
|
116
|
-
|
117
|
-
|
118
|
-
h4. Running the script
|
119
|
-
|
120
|
-
Here, again, is the code (in full!) for the twitter_request_parser.rb script.
|
121
|
-
|
122
|
-
#
|
123
|
-
# Instantiate each incoming request.
|
124
|
-
# Stream out the contained classes it generates.
|
125
|
-
#
|
126
|
-
class TwitterRequestParser < Wukong::Streamer::StructStreamer
|
127
|
-
def process request
|
128
|
-
request.parse do |obj|
|
129
|
-
yield obj
|
130
|
-
end
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
# This makes the script go.
|
135
|
-
Wukong::Script.new(TwitterRequestParser, nil).run
|
136
|
-
|
137
|
-
That last line is the runner: it makes this a Wukong script with a map phase only. (We'll add in a reducer later on.)
|
138
|
-
|
data/docpages/_config.yml
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
---
|
2
|
-
permalink: ":year-:month/:title.html"
|
3
|
-
markdown: rdiscount
|
4
|
-
pygments: true
|
5
|
-
auto: true
|
6
|
-
server: true
|
7
|
-
server_port: 4000
|
8
|
-
maruku:
|
9
|
-
use_tex: false
|
10
|
-
use_divs: false
|
11
|
-
png_dir: images/latex
|
12
|
-
png_url: /images/latex
|
13
|
-
|
14
|
-
header_ref: '.html' # .html for subdirs, / for main.
|
15
|
-
assets_path: '/' # http://github.mrflip.com
|
16
|
-
|
17
|
-
gemuser: mrflip
|
18
|
-
gemname: wukong
|
19
|
-
gemversion: 0.1.1
|
20
|
-
title: mrflip.github.com/wukong
|
21
|
-
|
22
|
-
keywords: [ 'wukong,hadoop,ruby,mrflip,infochimps,map,reduce,streaming,dumbo,happy,mrtoolkit,script,simple' ]
|
23
|
-
description: "Wukong: Hadoop made so easy a Chimpanzee could run it."
|
24
|
-
header_files:
|
25
|
-
- INSTALL
|
26
|
-
- LICENSE
|
27
|
-
- usage
|
28
|
-
- wutils
|
29
|
-
- moreinfo
|
30
|
-
- tutorial
|
31
|
-
|
32
|
-
credits:
|
33
|
-
<p>Wukong image courtesy
|
34
|
-
<a href="http://www.curtbusse.com/okavango/page1/oka1.html">Curt Busse</a> under
|
35
|
-
an <a href="http://www.curtbusse.com/copyright.html">open license</a>.
|
36
|
-
It's a Chacma Baboon from the Okavango site. Make sure to read the
|
37
|
-
<a href="http://www.curtbusse.com/okavango/page1/oka1.html#note3">story at the bottom of that page</a>.
|
38
|
-
</p>
|
39
|
-
|
@@ -1,56 +0,0 @@
|
|
1
|
-
* Spec: http://avro.apache.org/docs/current/spec.html
|
2
|
-
* Jira: https://issues.apache.org/jira/browse/AVRO
|
3
|
-
* Wiki: https://cwiki.apache.org/confluence/display/AVRO/Index
|
4
|
-
|
5
|
-
* http://github.com/phunt/avro-rpc-quickstart
|
6
|
-
|
7
|
-
* http://lucene.apache.org/java/2_4_0/fileformats.html#VInt -- types
|
8
|
-
* http://code.google.com/apis/protocolbuffers/docs/encoding.html#types -- a good reference
|
9
|
-
* Avro + Eventlet (Python evented code): http://unethicalblogger.com/node/282
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
Cassandra + Avro
|
14
|
-
|
15
|
-
* Make bulk loading into Cassandra less crappy, more pluggable https://issues.apache.org/jira/browse/CASSANDRA-1278
|
16
|
-
* Refactor Streaming: https://issues.apache.org/jira/browse/CASSANDRA-1189
|
17
|
-
* Increment Counters: https://issues.apache.org/jira/browse/CASSANDRA-1072
|
18
|
-
|
19
|
-
== From hammer's avro tools:
|
20
|
-
|
21
|
-
#! /usr/bin/env python
|
22
|
-
|
23
|
-
import sys
|
24
|
-
from avro import schema
|
25
|
-
from avro.genericio import DatumReader
|
26
|
-
from avro.io import DataFileReader
|
27
|
-
|
28
|
-
if __name__ == "__main__":
|
29
|
-
if len(sys.argv) < 2:
|
30
|
-
print "Need to at least specify an Avro file."
|
31
|
-
outfile_name = sys.argv[1]
|
32
|
-
|
33
|
-
message_schema = None
|
34
|
-
if len(sys.argv) > 2:
|
35
|
-
message_schema = schema.parse(schema.parse(sys.argv[2].encode("utf-8")))
|
36
|
-
|
37
|
-
r = file(outfile_name, 'r')
|
38
|
-
dr = DatumReader(expected = message_schema)
|
39
|
-
dfr = DataFileReader(r, dr)
|
40
|
-
for record in dfr:
|
41
|
-
print record
|
42
|
-
dfr.close()
|
43
|
-
|
44
|
-
from binascii import hexlify
|
45
|
-
|
46
|
-
def avro_hexlify(reader):
|
47
|
-
"""Return the hex value, as a string, of a binary-encoded int or long."""
|
48
|
-
bytes = []
|
49
|
-
current_byte = reader.read(1)
|
50
|
-
bytes.append(hexlify(current_byte))
|
51
|
-
while (ord(current_byte) & 0x80) != 0:
|
52
|
-
current_byte = reader.read(1)
|
53
|
-
bytes.append(hexlify(current_byte))
|
54
|
-
return ' '.join(bytes)
|
55
|
-
|
56
|
-
|