wukong 3.0.0.pre → 3.0.0.pre2
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
data/docpages/intro.textile
DELETED
data/docpages/moreinfo.textile
DELETED
@@ -1,174 +0,0 @@
---
layout: default
title: mrflip.github.com/wukong - TODO
collapse: false
---


h1(gemheader). Wukong More Info

** "Why is it called Wukong?":#name
** "Don't Use Wukong, use this instead":#whateverdude
** "Further Reading and useful links":#links
** "Note on Patches/Pull Requests":#patches
** "What's up with Wukong::AndPig?":#andpig
** "Map/Reduce Algorithms":#algorithms
** "TODOs":#TODO


<notextile><div class="toggle"></notextile>

h2(#name). Why is it called Wukong?

Hadoop, as you may know, is "named after a stuffed elephant.":http://en.wikipedia.org/wiki/Hadoop Since Wukong was started by the "infochimps":http://infochimps.org team, we needed a simian analog. A Monkey King who journeyed to the land of the Elephant seems to fit the bill:

bq. Sun Wukong (孙悟空), known in the West as the Monkey King, is the main character in the classical Chinese epic novel Journey to the West. In the novel, he accompanies the monk Xuanzang on the journey to retrieve Buddhist sutras from India.

bq. Sun Wukong possesses incredible strength, being able to lift his 13,500 jīn (8,100 kg) Ruyi Jingu Bang with ease. He also has superb speed, traveling 108,000 li (54,000 kilometers) in one somersault. Sun knows 72 transformations, which allow him to transform into various animals and objects; he is, however, shown with slight problems transforming into other people, since he is unable to complete the transformation of his tail. He is a skilled fighter, capable of holding his own against the best generals of heaven. Each of his hairs possesses magical properties, and is capable of transforming into a clone of the Monkey King himself, or various weapons, animals, and other objects. He also knows various spells in order to command wind, part water, conjure protective circles against demons, and freeze humans, demons, and gods alike. -- ["Sun Wukong's Wikipedia entry":http://en.wikipedia.org/wiki/Wukong]

The "Jaime Hewlett / Damon Albarn short":http://news.bbc.co.uk/sport1/hi/olympics/monkey that the BBC made for their 2008 Olympics coverage gives the general idea.

<notextile></div><div class="toggle"></notextile>

h2(#algorithms). Map/Reduce Algorithms

Example graph scripts:

* Multigraph
* Pagerank (done)
* Breadth-first search
* Triangle enumeration
* Clustering

h3. K-Nearest Neighbors

More example hadoop algorithms:
* Bigram counts: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/bigrams.html
* Inverted index construction: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/indexer.html
* Pagerank: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/pagerank.html
* SIPs, Median, classifiers and more: http://matpalm.com/
* Brad Heintz's "Distributed Computing with Ruby":http://www.bradheintz.com/no1thing/talks/ demonstrates Travelling Salesman in map/reduce.

* "Clustering billions of images with large scale nearest neighbor search":http://scholar.google.com/scholar?cluster=2473742255769621469&hl=en uses three map/reduce passes:
** Subsample to build a "spill tree" that roughly localizes each object
** Use the spill tree on the full dataset to group each object with its potential neighbors
** Calculate the metrics and emit only the k-nearest neighbors

Example scripts to try (from http://www.cloudera.com/resources/learning-mapreduce):

1. Find the [number of] hits by 5 minute timeslot for a website given its access logs (sketched below).
2. Find the pages with over 1 million hits in a day for a website given its access logs.
3. Find the pages that link to each page in a collection of webpages.
4. Calculate the proportion of lines that match a given regular expression for a collection of documents.
5. Sort tabular data by a primary and secondary column.
6. Find the most popular pages for a website given its access logs.

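Exercise 1 is a natural fit for the streamer classes this release retires (see @data/old/wukong/streamer/@ in the file list above). Here is a minimal sketch against that wukong 2.x API; the log-timestamp regexp and all names are illustrative, not shipped code:

<pre>
require 'wukong'

module TimeslotHits
  # Emits [five_minute_bucket, 1] for each request in an Apache access log.
  class Mapper < Wukong::Streamer::LineStreamer
    # Apache-style timestamps look like [12/Aug/2009:08:05:32 +0000]
    TIME_RE = %r{\[(\d+/\w+/\d+):(\d+):(\d+):\d+ [^\]]+\]}
    def process line
      return unless line =~ TIME_RE
      date, hour, min = $1, $2, $3.to_i
      yield [ "#{date}:#{hour}:%02d" % (min - (min % 5)), 1 ]
    end
  end

  # Sums the per-timeslot counts that hadoop groups together.
  class Reducer < Wukong::Streamer::ListReducer
    def finalize
      yield [ key, values.map(&:last).map(&:to_i).inject(0, :+) ]
    end
  end
end

Wukong::Script.new(TimeslotHits::Mapper, TimeslotHits::Reducer).run
</pre>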
<notextile></div><div class="toggle"></notextile>

h2(#whateverdude). Don't Use Wukong, use this instead

There are several worthy Hadoop|Streaming Frameworks:

* infochimps.org's "Wukong":http://github.com/mrflip/wukong -- ruby; object-oriented *and* record-oriented
* NYTimes' "MRToolkit":http://code.google.com/p/mrtoolkit/ -- ruby; much more log-oriented
* Freebase's "Happy":http://code.google.com/p/happy/ -- python; the most performant, as it can use Jython to make direct API calls.
* Last.fm's "Dumbo":http://wiki.github.com/klbostee/dumbo -- python

Most people use Wukong / one of the above (or straight Java Hadoop, poor souls) for heavy lifting, and several of the following hadoop tools for efficiency:

* Pig OR
* Hive -- hive is more SQL-ish, Pig is more elegant (in a brushed-metal kind of way). I greatly prefer Pig, because I hate SQL; you may feel differently.
* Sqoop
* Mahout

<notextile></div><div class="toggle"></notextile>

h2(#links). Further Reading and useful links:

* "Ruby Hadoop Quickstart":http://blog.pdatasolutions.com/post/191978092/ruby-on-hadoop-quickstart - dive right in with Wukong, Hadoop and the Amazon Elastic MapReduce cloud. Once you get bored with the command line, this is the fastest path to Wukong power.
* "Distributed Computing with Ruby":http://www.bradheintz.com/no1thing/talks/ has some raw ruby, some Wukong and some JRuby/Hadoop integration -- it demonstrates a Travelling Salesman in map/reduce. Cool!

* "Hadoop, The Definitive Guide":http://www.amazon.com/Hadoop-Definitive-Guide-Tom-White/dp/0596521979

* "Running Hadoop On Ubuntu Linux (Single-Node Cluster)":http://www.michael-noll.com/wiki/Running_Hadoop_On_Ubuntu_Linux_(Single-Node_Cluster) and "Running Hadoop On Ubuntu Linux (Multi-Node Cluster)":http://www.michael-noll.com/wiki/Running_Hadoop_On_Ubuntu_Linux_(Multi-Node_Cluster)
* "Running Hadoop MapReduce on Amazon EC2 and S3":http://developer.amazonwebservices.com/connect/entry.jspa?externalID=873

* "Hadoop Overview by Doug Cutting":http://video.google.com/videoplay?docid=-4912926263813234341 - the founder of the Hadoop project. (49m video)

* "Cluster Computing and Map|Reduce":http://www.youtube.com/results?search_query=cluster+computing+and+mapreduce
** "Lecture 1: Overview":http://www.youtube.com/watch?v=yjPBkvYh-ss
** "Lecture 2 (technical): Map|Reduce":http://www.youtube.com/watch?v=-vD6PUdf3Js
** "Lecture 3 (technical): GFS (Google File System)":http://www.youtube.com/watch?v=5Eib_H_zCEY
** "Lecture 4 (theoretical): Canopy Clustering":http://www.youtube.com/watch?v=1ZDybXl212Q
** "Lecture 5 (theoretical): Breadth-First Search":http://www.youtube.com/watch?v=BT-piFBP4fE

* "Cloudera Hadoop Training:":http://www.cloudera.com/hadoop-training
** "Thinking at Scale":http://www.cloudera.com/hadoop-training-thinking-at-scale
** "Mapreduce and HDFS":http://www.cloudera.com/hadoop-training-mapreduce-hdfs
** "A Tour of the Hadoop Ecosystem":http://www.cloudera.com/hadoop-training-ecosystem-tour
** "Programming with Hadoop":http://www.cloudera.com/hadoop-training-programming-with-hadoop
** "Hadoop and Hive: introduction":http://www.cloudera.com/hadoop-training-hive-introduction
** "Hadoop and Hive: tutorial":http://www.cloudera.com/hadoop-training-hive-tutorial
** "Hadoop and Pig: Introduction":http://www.cloudera.com/hadoop-training-pig-introduction
** "Hadoop and Pig: Tutorial":http://www.cloudera.com/hadoop-training-pig-tutorial
** "Mapreduce Algorithms":http://www.cloudera.com/hadoop-training-mapreduce-algorithms
** "Exercise: Getting started with Hadoop":http://www.cloudera.com/hadoop-training-exercise-getting-started-with-hadoop
** "Exercise: Writing mapreduce programs":http://www.cloudera.com/hadoop-training-exercise-writing-mapreduce-programs
** "Cloudera Blog":http://www.cloudera.com/blog/

* "Hadoop Wiki: Hadoop Streaming":http://wiki.apache.org/hadoop/HadoopStreaming
* "Hadoop Docs: Hadoop Streaming":http://hadoop.apache.org/common/docs/current/streaming.html

* A "dimwitted screed on Ruby, Hadoop and Starling":http://www.theregister.co.uk/2008/08/11/hadoop_dziuba/ seemingly written with jockstrap on head.

<notextile></div><div class="toggle"></notextile>

h2(#patches). Note on Patches/Pull Requests

* Fork the project.
* Make your feature addition or bug fix.
* Add tests for it. This is important so I don't break it in a future version unintentionally.
* Commit; do not mess with the rakefile, version, or history. (If you want your own version, that is fine -- but bump the version in a commit by itself that I can ignore when I pull.)
* Send me a pull request. Bonus points for topic branches.

<notextile></div><div class="toggle"></notextile>

h2(#andpig). What's up with Wukong::AndPig?

@Wukong::AndPig@ is a small library to more easily generate code for the "Pig":http://hadoop.apache.org/pig data analysis language. See its "README":http://github.com/mrflip/wukong/tree/master/lib/wukong/and_pig/README.textile for more.

It's **not really being worked on**, and you should probably **ignore it**.

<notextile></div><div class="toggle"></notextile>

h2(#todo). TODOs

Utility

* columnizing / reconstituting

* Set up with JRuby
* Allow for direct HDFS operations
* Make the dfs commands slightly less stupid
* Add more standard options
* Allow for combiners
* JobStarter / JobSteps
* Might as well take dumbo's command line args

BUGS:

* Can't do multiple input files in local mode

Patterns to implement:

* Stats reducer
** basic sum, avg, max, min, std.dev of a numeric field
** the "running standard deviation":http://www.johndcook.com/standard_deviation.html (see the sketch after this list)

* Efficient median (and other order statistics)

* Make StructRecordizer work generically with other reducers (spec. AccumulatingReducer)

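For reference, the running standard deviation above fits in a few lines of plain Ruby (a sketch of Welford's method from the John D. Cook article; the class name and interface are illustrative, not an existing Wukong widget):

<pre>
# Accumulates a running mean and variance in one pass (Welford's method).
class RunningStats
  attr_reader :n, :mean
  def initialize
    @n, @mean, @m2 = 0, 0.0, 0.0
  end
  def add(x)
    @n    += 1
    delta  = x - @mean
    @mean += delta / @n            # updated running mean
    @m2   += delta * (x - @mean)   # running sum of squared deviations
  end
  def variance
    @n > 1 ? @m2 / (@n - 1) : 0.0  # sample variance
  end
  def stddev
    Math.sqrt(variance)
  end
end
</pre>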
Make wutils: tsv-oriented implementations of the coreutils (eg uniq, sort, cut, nl, wc, split, ls, df and du) that intrinsically accept and emit tab-separated records.

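To make the idea concrete, a wutils-flavored @cut@ could be as small as this (entirely hypothetical -- no such binary ships in this release):

<pre>
#!/usr/bin/env ruby
# Hypothetical wutils-style cut: `wu-cut.rb 3 1 < in.tsv` emits field 3, then field 1.
fields = ARGV.map{ |arg| arg.to_i - 1 }
$stdin.each_line do |line|
  vals = line.chomp.split("\t", -1)        # -1 keeps trailing empty fields
  puts vals.values_at(*fields).join("\t")
end
</pre>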
<notextile></div></notextile>
data/docpages/news.html
DELETED
@@ -1,24 +0,0 @@
---
layout: default
title: edamame news
collapse: true
---
<h1 class="gemheader">{% if site.gemname %}{{ site.gemname }}{% else %}mrflip{% endif %}<span class="small">:: news</span></h1>

<div id="news">
  {% for t in site.posts %} {% assign has_posts = true %}{% endfor %}{% if has_posts %}
  {% for post in site.posts %}
  <div class="toggle" id="news-{{ post.id }}">

    <h2><a href="{{ post.url }}">{{ post.title }}</a><span class="postdate"> » {{ post.date | date_to_string }}</span></h2>

    {{ post.content }}

  </div>
  {% endfor %}
  {% else %}
  <p class="heavy">
    <em>(no news. good news?)</em>
  </p>
  {% endif %}
</div>
data/docpages/pig/PigLatinExpressionsList.txt
DELETED
@@ -1,122 +0,0 @@

h2. RelationalOperators

* foreach

* cogroup
* group
* join

* cross

* distinct
* filter
* limit
* order
* split
* union

* load
* store

h2. Streaming Operator
* stream

h2. UDF Statements
* define
* register

h2. Diagnostic Statements
* describe
* dump
* explain
* illustrate

h2. Built-in Functions
* EvalFunctions
* AVG
* CONCAT
* COUNT
* DIFF
* MIN
* MAX
* SIZE
* SUM
* TOKENIZE
* Load/StoreFunctions
* BinaryDeserializer
* BinarySerializer
* BinStorage
* PigStorage
* PigDump
* TextLoader

h1. Operators
* ArithmeticOperators
- addition +
- subtraction -
- multiplication *
- division /
- modulo %
- bincond ?
* ComparisonOperators
- equal ==
- notequal !=
- lessthan <
- greaterthan >
- lessthanorequalto <=
- greaterthanorequalto >=
- patternmatching matches
* NullOperators
- is null
- is not null
* BooleanOperators
- and
- or
- not
* DereferenceOperators
- tupledereference .
- mapdereference #
* SignOperators
- positive +
- negative -
* CastOperators
- (type)$0
- (type)alias
- Nulls
- Constants
- Expressions
- Schemas
- Keywords

h1. DataTypes
h2. SimpleDataTypes
- int
- long
- double
- arrays
- chararray
- bytearray
h2. ComplexDataTypes
- tuple
- bag
- map

h1. FileCommands
* cat
* cd
* copyFromLocal
* copyToLocal
* cp
* ls
* mkdir
* mv
* pwd
* rm
* rmf

h1. UtilityCommands
* help
* kill
* quit
* set
data/docpages/pig/PigLatinReferenceManual.txt
DELETED
@@ -1,1640 +0,0 @@
# ---------------------------------------------------------------------------
#
# = CROSS
#
# Computes the cross product of two or more relations.
#
# == Syntax
#
#   alias = CROSS alias, alias [, alias …] [PARALLEL n];
#
# == Terms
#
# alias::
#   The name of a relation.
#
# PARALLEL n::
#   Increase the parallelism of a job by specifying the number of reduce tasks,
#   n. The optimal number of parallel tasks depends on the amount of memory on
#   each node and the memory required by each of the tasks. To determine n, use
#   the following as a general guideline:
#     n = (nr_nodes - 1) * 0.45 * nr_GB
#   where nr_nodes is the number of nodes used and nr_GB is the amount of physical
#   memory on each node.
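#   As a worked example of that guideline: a 10-node cluster with 8 GB of
#   physical memory per node gives n = (10 - 1) * 0.45 * 8 = 32.4, i.e. about 32 reduce tasks.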
#
|
25
|
-
# Note the following:
|
26
|
-
# * Parallel only affects the number of reduce tasks. Map parallelism is
|
27
|
-
# determined by the input file, one map for each HDFS block.
|
28
|
-
# * If you don’t specify parallel, you still get the same map parallelism but
|
29
|
-
# only one reduce task.
|
30
|
-
#
|
31
|
-
# == Usage
|
32
|
-
#
|
33
|
-
# Use the CROSS operator to compute the cross product (Cartesian product) of two
|
34
|
-
# or more relations.
|
35
|
-
#
|
36
|
-
# CROSS is an expensive operation and should be used sparingly.
|
37
|
-
#
|
38
|
-
# == Example
|
39
|
-
#
|
40
|
-
# Suppose we have relations A and B.
|
41
|
-
#
|
42
|
-
# (A) (B)
|
43
|
-
# ----------- --------
|
44
|
-
# (1, 2, 3) (2, 4)
|
45
|
-
# (4, 2, 1) (8, 9)
|
46
|
-
# (1, 3)
|
47
|
-
#
|
48
|
-
# In this example the cross product of relation A and B is computed.
|
49
|
-
#
|
50
|
-
# X = CROSS A, B;
|
51
|
-
#
|
52
|
-
# Relation X looks like this.
|
53
|
-
#
|
54
|
-
# (1, 2, 3, 2, 4)
|
55
|
-
# (1, 2, 3, 8, 9)
|
56
|
-
# (1, 2, 3, 1, 3)
|
57
|
-
# (4, 2, 1, 2, 4)
|
58
|
-
# (4, 2, 1, 8, 9)
|
59
|
-
# (4, 2, 1, 1, 3)
|
60
|
-
#
|
61
|
-
|
62
|
-
|
63
|
-
# ---------------------------------------------------------------------------
|
64
|
-
#
|
65
|
-
# DISTINCT
|
66
|
-
#
|
67
|
-
# Removes duplicate tuples in a relation.
|
68
|
-
#
|
69
|
-
# == Syntax
|
70
|
-
#
|
71
|
-
# alias = DISTINCT alias [PARALLEL n];
|
72
|
-
#
|
73
|
-
# == Terms
|
74
|
-
#
|
75
|
-
# alias::
|
76
|
-
# The name of a relation.
|
77
|
-
#
|
78
|
-
# PARALLEL n::
|
79
|
-
# Increase the parallelism of a job by specifying the number of reduce tasks,
|
80
|
-
# n. The optimal number of parallel tasks depends on the amount of memory on
|
81
|
-
# each node and the memory required by each of the tasks. To determine n, use
|
82
|
-
# the following as a general guideline:
|
83
|
-
# n = (nr_nodes - 1) * 0.45 * nr_GB
|
84
|
-
# where nr_nodes is the number of nodes used and nr_GB is the amount of physical
|
85
|
-
# memory on each node.
|
86
|
-
#
|
87
|
-
# Note the following:
|
88
|
-
# * Parallel only affects the number of reduce tasks. Map parallelism is
|
89
|
-
# determined by the input file, one map for each HDFS block.
|
90
|
-
# * If you don’t specify parallel, you still get the same map parallelism but
|
91
|
-
# only one reduce task.
|
92
|
-
#
|
93
|
-
# == Usage
|
94
|
-
#
|
95
|
-
# Use the DISTINCT operator to remove duplicate tuples in a relation. DISTINCT
|
96
|
-
# does not preserve the original order of the contents (to eliminate duplicates,
|
97
|
-
# Pig must first sort the data). You cannot use DISTINCT on a subset of fields. To
|
98
|
-
# do this, use FOREACH … GENERATE to select the fields, and then use DISTINCT.
|
99
|
-
#
|
100
|
-
# == Example
|
101
|
-
#
|
102
|
-
# Suppose we have relation A.
|
103
|
-
#
|
104
|
-
# (A)
|
105
|
-
# ---------
|
106
|
-
# (8, 3, 4)
|
107
|
-
# (1, 2, 3)
|
108
|
-
# (4, 3, 3)
|
109
|
-
# (4, 3, 3)
|
110
|
-
# (1, 2, 3)
|
111
|
-
#
|
112
|
-
# In this example all duplicate tuples are removed.
|
113
|
-
#
|
114
|
-
# X = DISTINCT A;
|
115
|
-
#
|
116
|
-
# Relation X looks like this.
|
117
|
-
#
|
118
|
-
# (1, 2, 3)
|
119
|
-
# (4, 3, 3)
|
120
|
-
# (8, 3, 4)
|
121
|
-
#
|
122
|
-
|
123
|
-
# ---------------------------------------------------------------------------
|
124
|
-
#
|
125
|
-
# FILTER
|
126
|
-
#
|
127
|
-
# Selects tuples (rows) from a relation based on some condition.
|
128
|
-
#
|
129
|
-
# == Syntax
|
130
|
-
#
|
131
|
-
# alias = FILTER alias BY expression;
|
132
|
-
#
|
133
|
-
# == Terms
|
134
|
-
#
|
135
|
-
# alias::
|
136
|
-
# The name of a relation.
|
137
|
-
#
|
138
|
-
# BY::
|
139
|
-
# Required keyword.
|
140
|
-
#
|
141
|
-
# expression::
|
142
|
-
# An expression.
|
143
|
-
#
|
144
|
-
# == Usage
|
145
|
-
#
|
146
|
-
# Use the FILTER operator to work with tuples (rows) of data. FILTER is commonly
|
147
|
-
# used to select the data that you want; or, conversely, to filter out (remove)
|
148
|
-
# the data you don’t want.
|
149
|
-
#
|
150
|
-
# Note: If you want to work with specific fields (columns) of data, use the
|
151
|
-
# FOREACH …GENERATE operation.
|
152
|
-
#
|
153
|
-
# == Examples
|
154
|
-
#
|
155
|
-
# Suppose we have relation A.
|
156
|
-
#
|
157
|
-
# (A: f1:int, f2:int, f3:int)
|
158
|
-
# ----------------
|
159
|
-
# (1, 2, 3)
|
160
|
-
# (4, 2, 1)
|
161
|
-
# (8, 3, 4)
|
162
|
-
# (4, 3, 3)
|
163
|
-
# (7, 2, 5)
|
164
|
-
# (8, 4, 3)
|
165
|
-
#
|
166
|
-
# In this example the condition states that if the third field equals 3, then add the tuple to relation X.
|
167
|
-
#
|
168
|
-
# X = FILTER A BY f3 == 3;
|
169
|
-
#
|
170
|
-
# Relation X looks like this.
|
171
|
-
#
|
172
|
-
# (1, 2, 3)
|
173
|
-
# (4, 3, 3)
|
174
|
-
# (8, 4, 3)
|
175
|
-
#
|
176
|
-
# In this example the condition states that if the first field equals 8 or if the sum of fields f2 and f3 is not greater than first field, then add the tuple to relation X.
|
177
|
-
#
|
178
|
-
# X = FILTER A BY (f1 == 8) OR (NOT (f2+f3 > f1));
|
179
|
-
#
|
180
|
-
# Relation X looks like this.
|
181
|
-
#
|
182
|
-
# (4, 2, 1)
|
183
|
-
# (8, 3, 4)
|
184
|
-
# (7, 2, 5)
|
185
|
-
# (8, 4, 3)
|
186
|
-
#
|
187
|
-
|
188
|
-
# ---------------------------------------------------------------------------
|
189
|
-
#
|
190
|
-
# FOREACH … GENERATE
|
191
|
-
#
|
192
|
-
# Generates data transformations based on fields (columns) of data.
|
193
|
-
#
|
194
|
-
# == Syntax
|
195
|
-
#
|
196
|
-
# alias = FOREACH { gen_blk | nested_gen_blk } [AS schema];
|
197
|
-
#
|
198
|
-
# == Terms
|
199
|
-
#
|
200
|
-
# alias::
|
201
|
-
# The name of a relation.
|
202
|
-
#
|
203
|
-
# gen_blk::
|
204
|
-
# FOREACH … GENERATE used with a non-nested relation. Use this syntax:
|
205
|
-
#
|
206
|
-
# alias = FOREACH alias GENERATE expression [expression ….]
|
207
|
-
#
|
208
|
-
# nested_gen_blk::
|
209
|
-
# FOREACH … GENERATE used with a nested relation. Use this syntax:
|
210
|
-
#
|
211
|
-
# alias = FOREACH nested_alias {
|
212
|
-
# alias = nested_op; [alias = nested_op; …]
|
213
|
-
# GENERATE expression [expression ….]
|
214
|
-
# };
|
215
|
-
#
|
216
|
-
# where:
|
217
|
-
# * The nested block is enclosed in opening and closing brackets { … }.
|
218
|
-
# * The GENERATE keyword must be the last statement within the nested block.
|
219
|
-
#
|
220
|
-
# expression::
|
221
|
-
# An expression.
|
222
|
-
#
|
223
|
-
# nested_alias::
|
224
|
-
# If one of the fields (columns) in a relation is a bag, the bag can be treated
|
225
|
-
# as an inner or a nested relation.
|
226
|
-
#
|
227
|
-
# nested_op::
|
228
|
-
# Allowable operations include FILTER, ORDER, and DISTINCT.
|
229
|
-
#
|
230
|
-
# The FOREACH … GENERATE operation itself is not allowed since this could lead
|
231
|
-
# to an arbitrary number of nesting levels.
|
232
|
-
#
|
233
|
-
# AS::
|
234
|
-
# Keyword.
|
235
|
-
#
|
236
|
-
# schema::
|
237
|
-
# A schema using the AS keyword (see Schemas).
|
238
|
-
#
|
239
|
-
# * If the FLATTEN keyword is used, enclose the schema in parentheses.
|
240
|
-
#
|
241
|
-
# * If the FLATTEN keyword is not used, don't enclose the schema in parentheses.
|
242
|
-
#
|
243
|
-
# == Usage
|
244
|
-
#
|
245
|
-
# Use the FOREACH …GENERATE operation to work with individual fields (columns) of data. The FOREACH …GENERATE operation works with non-nested and nested relations.
|
246
|
-
#
|
247
|
-
# A statement with a non-nested relation A could look like this.
|
248
|
-
#
|
249
|
-
# X = FOREACH A GENERATE f1;
|
250
|
-
#
|
251
|
-
# A statement with a nested relation A could look like this.
|
252
|
-
#
|
253
|
-
# X = FOREACH B {
|
254
|
-
#
|
255
|
-
# S = FILTER A by 'xyz';
|
256
|
-
#
|
257
|
-
# GENERATE COUNT (S.$0);
|
258
|
-
#
|
259
|
-
# }
|
260
|
-
#
|
261
|
-
# Note: FOREACH … GENERATE works with fields (columns) of data. If you want to work with entire tuples (rows) of data, use the FILTER operation.
|
262
|
-
#
|
263
|
-
# == Examples
|
264
|
-
#
|
265
|
-
# Suppose we have relations A and B, and derived relation C (where C = COGROUP A BY a1 INNER, B BY b1 INNER;).
|
266
|
-
#
|
267
|
-
# (A: a1:int, a2:int, a3:int)
|
268
|
-
# -----------------
|
269
|
-
# (1, 2, 3)
|
270
|
-
# (4, 2, 1)
|
271
|
-
# (8, 3, 4)
|
272
|
-
# (4, 3, 3)
|
273
|
-
# (7, 2, 5)
|
274
|
-
# (8, 4, 3)
|
275
|
-
#
|
276
|
-
#
|
277
|
-
# (B: b1:int, b2:int)
|
278
|
-
# ---------------
|
279
|
-
# (2, 4)
|
280
|
-
# (8, 9)
|
281
|
-
# (1, 3)
|
282
|
-
# (2, 7)
|
283
|
-
# (2, 9)
|
284
|
-
# (4, 6)
|
285
|
-
# (4, 9)
|
286
|
-
#
|
287
|
-
# (C: c1, c2, c3)
|
288
|
-
# ---------------------
|
289
|
-
# (1, {(1, 2, 3)}, {(1, 3)})
|
290
|
-
# (4, {(4, 2, 1), (4, 3, 3)}, {(4, 6), (4, 9)})
|
291
|
-
# (8, {(8, 3, 4), (8, 4, 3)}, {(8, 9)})
|
292
|
-
#
|
293
|
-
#
|
294
|
-
# == Example: Projection
|
295
|
-
#
|
296
|
-
# In this example the asterisk (*) is used to project all fields from relation A to relation X (this is similar to SQL Select *). Relation A and X are identical.
|
297
|
-
#
|
298
|
-
# X = FOREACH A GENERATE *;
|
299
|
-
#
|
300
|
-
# In this example two fields from relation A are projected to form relation X.
|
301
|
-
#
|
302
|
-
# X = FOREACH A GENERATE a1, a2;
|
303
|
-
#
|
304
|
-
# Relation X looks this.
|
305
|
-
#
|
306
|
-
# (1, 2)
|
307
|
-
# (4, 2)
|
308
|
-
# (8, 3)
|
309
|
-
# (4, 3)
|
310
|
-
# (7, 2)
|
311
|
-
# (8, 4)
|
312
|
-
# == Example: Nested Projection
|
313
|
-
#
|
314
|
-
# Note: See GROUP for information about the "group" field in relation C.
|
315
|
-
#
|
316
|
-
# In this example if one of the fields in the input relation is a tuple, bag or map, we can perform projection on that field.
|
317
|
-
#
|
318
|
-
# X = FOREACH C GENERATE group, B.b2;
|
319
|
-
#
|
320
|
-
# Relation X looks like this.
|
321
|
-
#
|
322
|
-
# (1, {(3)})
|
323
|
-
# (4, {(6), (9)})
|
324
|
-
# (8, {(9)})
|
325
|
-
#
|
326
|
-
# In this example multiple nested columns are retained.
|
327
|
-
#
|
328
|
-
# X = FOREACH C GENERATE group, A.(a1, a2);
|
329
|
-
#
|
330
|
-
# Relation X looks like this.
|
331
|
-
#
|
332
|
-
# (1, {(1, 2)})
|
333
|
-
# (4, {(4, 2), (4, 3)})
|
334
|
-
# (8, {(8, 3), (8, 4)})
|
335
|
-
# == Example: Schema
|
336
|
-
#
|
337
|
-
# In this example two fields in relation A are summed to form relation X. A schema is defined for the projected field.
|
338
|
-
#
|
339
|
-
# X = FOREACH A GENERATE a1+a2 AS f1:int;
|
340
|
-
#
|
341
|
-
# Y = FILTER X by f1 > 10;
|
342
|
-
#
|
343
|
-
# Relations X and Y look this.
|
344
|
-
#
|
345
|
-
# (X) (Y)
|
346
|
-
# ----- ------
|
347
|
-
# (3) (11)
|
348
|
-
# (6) (12)
|
349
|
-
# (11)
|
350
|
-
# (7)
|
351
|
-
# (9)
|
352
|
-
# (12)
|
353
|
-
#
|
354
|
-
# == Example: Applying Functions
|
355
|
-
#
|
356
|
-
# Note: See GROUP for information about the "group" field in relation C.
|
357
|
-
#
|
358
|
-
# In this example the built-in function SUM() is used to sum a set of numbers in a bag.
|
359
|
-
#
|
360
|
-
# X = FOREACH C GENERATE group, SUM (A.a1);
|
361
|
-
#
|
362
|
-
# Relation X looks like this.
|
363
|
-
#
|
364
|
-
# (1, 1)
|
365
|
-
# (4, 8)
|
366
|
-
# (8, 16)
|
367
|
-
# == Example: Flattening
|
368
|
-
#
|
369
|
-
# Note: See GROUP for information about the "group" field in relation C.
|
370
|
-
#
|
371
|
-
# In this example the FLATTEN keyword is used to eliminate nesting.
|
372
|
-
#
|
373
|
-
# X = FOREACH C GENERATE group, FLATTEN(A);
|
374
|
-
#
|
375
|
-
# Relation X looks like this.
|
376
|
-
#
|
377
|
-
# (1, 1, 2, 3)
|
378
|
-
# (4, 4, 2, 1)
|
379
|
-
# (4, 4, 3, 3)
|
380
|
-
# (8, 8, 3, 4)
|
381
|
-
# (8, 8, 4, 3)
|
382
|
-
#
|
383
|
-
# Another FLATTEN example.
|
384
|
-
#
|
385
|
-
# X = FOREACH C GENERATE GROUP, FLATTEN(A.a3);
|
386
|
-
#
|
387
|
-
# Relation X looks like this.
|
388
|
-
#
|
389
|
-
# (1, 3)
|
390
|
-
# (4, 1)
|
391
|
-
# (4, 3)
|
392
|
-
# (8, 4)
|
393
|
-
# (8, 3)
|
394
|
-
#
|
395
|
-
# Another FLATTEN example.
|
396
|
-
#
|
397
|
-
# X = FOREACH C GENERATE FLATTEN(A.(f1, f2)), FLATTEN(B.$1);
|
398
|
-
#
|
399
|
-
# Relation X looks like this. Note that for the group '4' in C, there are two tuples in each bag. Thus, when both bags are flattened, the cross product of these tuples is returned; that is, tuples (4, 2, 6), (4, 3, 6), (4, 2, 9), and (4, 3, 9).
|
400
|
-
#
|
401
|
-
# (1, 2, 3)
|
402
|
-
# (4, 2, 6)
|
403
|
-
# (4, 3, 6)
|
404
|
-
# (4, 2, 9)
|
405
|
-
# (4, 3, 9)
|
406
|
-
# (8, 3, 9)
|
407
|
-
# (8, 4, 9)
|
408
|
-
#
|
409
|
-
# == Example: Nested Block
|
410
|
-
#
|
411
|
-
# Suppose we have relation A and derived relation B (where B = GROUP A BY url;). Since relation B contains tuples with bags it can be treated as a nested relation.
|
412
|
-
#
|
413
|
-
# A (url:chararray, outlink:chararray)
|
414
|
-
# ---------------------------------------------
|
415
|
-
# (www.ccc.com,www.hjk.com)
|
416
|
-
# (www.ddd.com,www.xyz.org)
|
417
|
-
# (www.aaa.com,www.cvn.org)
|
418
|
-
# (www.www.com,www.kpt.net)
|
419
|
-
# (www.www.com,www.xyz.org)
|
420
|
-
# (www.ddd.com,www.xyz.org)
|
421
|
-
#
|
422
|
-
#
|
423
|
-
# B
|
424
|
-
# ---------------------------------------------
|
425
|
-
# (www.aaa.com,{(www.aaa.com,www.cvn.org)})
|
426
|
-
# (www.ccc.com,{(www.ccc.com,www.hjk.com)})
|
427
|
-
# (www.ddd.com,{(www.ddd.com,www.xyz.org),(www.ddd.com,www.xyz.org)})
|
428
|
-
# (www.www.com,{(www.www.com,www.kpt.net),(www.www.com,www.xyz.org)})
|
429
|
-
#
|
430
|
-
# In this example we perform two of the allowed Pig operations, FILTER (FA) and DISTINCT (DA), as well as projection (PA). Note that the last statement in the nested block must be GENERATE.
|
431
|
-
#
|
432
|
-
# X = foreach B {
|
433
|
-
# FA= FILTER A BY outlink == 'www.xyz.org';
|
434
|
-
# PA = FA.outlink;
|
435
|
-
# DA = DISTINCT PA;
|
436
|
-
# GENERATE GROUP, COUNT(DA);
|
437
|
-
# }
|
438
|
-
#
|
439
|
-
# Relation X looks like this.
|
440
|
-
#
|
441
|
-
# (www.ddd.com,1L)
|
442
|
-
# (www.www.com,1L)
|
443
|
-
|
444
|
-
|
445
|
-
# ---------------------------------------------------------------------------
|
446
|
-
#
|
447
|
-
# GROUP
|
448
|
-
#
|
449
|
-
# Groups the data in a single relation.
|
450
|
-
#
|
451
|
-
# == Syntax
|
452
|
-
#
|
453
|
-
# alias = GROUP alias
|
454
|
-
# [BY {[field_alias [, field_alias]] | * | [expression] } ]
|
455
|
-
# [ALL] [PARALLEL n];
|
456
|
-
#
|
457
|
-
# == Terms
|
458
|
-
#
|
459
|
-
# alias::
|
460
|
-
# The name of a relation.
|
461
|
-
#
|
462
|
-
# BY::
|
463
|
-
# Keyword. Use this clause to group the relation by fields or by expression.
|
464
|
-
#
|
465
|
-
# field_alias::
|
466
|
-
# The name of a field in a relation. This is the group key or key field.
|
467
|
-
#
|
468
|
-
# A relation can be grouped by a single field (f1) or by the composite value of
|
469
|
-
# multiple fields (f1,f2).
|
470
|
-
#
|
471
|
-
# *::
|
472
|
-
# The asterisk. A designator for all fields in the relation.
|
473
|
-
#
|
474
|
-
# expression::
|
475
|
-
# An expression.
|
476
|
-
#
|
477
|
-
# ALL::
|
478
|
-
# Keyword. Use ALL if you want all tuples to go to a single group; for example, when doing aggregates across entire relations.
|
479
|
-
#
|
480
|
-
# PARALLEL n::
|
481
|
-
# Increase the parallelism of a job by specifying the number of reduce tasks,
|
482
|
-
# n. The optimal number of parallel tasks depends on the amount of memory on
|
483
|
-
# each node and the memory required by each of the tasks. To determine n, use
|
484
|
-
# the following as a general guideline:
|
485
|
-
# n = (nr_nodes - 1) * 0.45 * nr_GB
|
486
|
-
# where nr_nodes is the number of nodes used and nr_GB is the amount of physical
|
487
|
-
# memory on each node.
|
488
|
-
#
|
489
|
-
# Note the following:
|
490
|
-
# * Parallel only affects the number of reduce tasks. Map parallelism is
|
491
|
-
# determined by the input file, one map for each HDFS block.
|
492
|
-
# * If you don’t specify parallel, you still get the same map parallelism but
|
493
|
-
# only one reduce task.
#
# == Usage
#
# The GROUP operator groups together tuples that have the same group key (key
# field). The result of a GROUP operation is a relation that includes one
# tuple per group. This tuple contains two fields:
#
# * The first field is named "group" (do not confuse this with the GROUP
#   operator) and has the same type as the group key.
#
# * The second field takes the name of the original relation and is type bag.
#
# Suppose we have the following data:
#
#   john    25  3.6
#   george  25  2.9
#   anne    27  3.9
#   julia   28  3.6
#
# And suppose we perform the LOAD and GROUP statements shown below. We can use
# the DESCRIBE operator to view the schema for relation Y. We can use DUMP to
# view the contents of Y.
#
# Note that relation Y has two fields. The first field is named "group" and is
# type int (the same as age). The second field takes the name of the original
# relation "X" and is type bag (it can contain tuples with three elements of
# type chararray, int, and float).
#
# Statements:
#
#   X = LOAD 'data' AS (name:chararray, age:int, gpa:float);
#   Y = GROUP X BY age;
#   DESCRIBE Y;
#   Y: {group: int,X: {name: chararray,age: int,gpa: float}}
#   DUMP Y;
#
#   (25,{(john,25,3.6F),(george,25,2.9F)})
#   (27,{(anne,27,3.9F)})
#   (28,{(julia,28,3.6F)})
#
# As shown in this FOREACH statement, we can refer to the fields in relation Y
# by their names "group" and "X".
#
#   Z = FOREACH Y GENERATE group, COUNT(X);
#
# Relation Z looks like this.
#
#   (25,2L)
#   (27,1L)
#   (28,1L)
#
# == Examples
#
# Suppose we have relation A.
#
#   A: (owner:chararray, pet:chararray)
#   -----------------
#   (Alice, turtle)
#   (Alice, goldfish)
#   (Alice, cat)
#   (Bob, dog)
#   (Bob, cat)
#
# In this example tuples are grouped using the field "owner".
#
#   X = GROUP A BY owner;
#
# Relation X looks like this. "group" is the name of the first field. "A" is
# the name of the second field.
#
#   (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)})
#   (Bob, {(Bob, dog), (Bob, cat)})
#
# In this example tuples are grouped using the ALL keyword. Relation A is then
# counted, and the count is projected into relation Y.
#
#   X = GROUP A ALL;
#   Y = FOREACH X GENERATE COUNT(A);
#
# Relation X looks like this. "group" is the name of the first field. "A" is
# the name of the second field.
#
#   (all,{(Alice,turtle),(Alice,goldfish),(Alice,cat),(Bob,dog),(Bob,cat)})
#
# Relation Y looks like this.
#
#   (5L)
#
# Suppose we have relation S.
#
#   S: (f1:chararray, f2:int, f3:int)
#   -----------------
#   (r1, 1, 2)
#   (r2, 2, 1)
#   (r3, 2, 8)
#   (r4, 4, 4)
#
# In this example tuples are grouped using an expression, f2*f3.
#
#   X = GROUP S BY f2*f3;
#
# Relation X looks like this. The first field is named "group". The second
# field is named "S".
#
#   (2, {(r1, 1, 2), (r2, 2, 1)})
#   (16, {(r3, 2, 8), (r4, 4, 4)})
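#
# The composite-key form mentioned under the Terms above can be sketched the
# same way (an illustrative variant, not from the original examples): grouping
# S by both f2 and f3 uses a tuple of fields as the key.
#
#   T = GROUP S BY (f2, f3);
#
# Here the "group" field of T is itself a tuple, (f2, f3).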

# ---------------------------------------------------------------------------
#
# JOIN
#
# Joins two or more relations based on common field values.
#
# == Syntax
#
#   alias = JOIN alias BY field_alias,
#           alias BY field_alias [, alias BY field_alias ...]
#           [PARALLEL n];
#
# == Terms
#
# alias::
# The name of a relation.
#
# BY::
# Keyword.
#
# field_alias::
# The name of a field in a relation. The alias and field_alias specified in
# the BY clause must correspond. For example:
#
#   X = JOIN relationA BY fieldA, relationB BY fieldB, relationC BY fieldC;
#
# PARALLEL n::
# Increase the parallelism of a job by specifying the number of reduce tasks,
# n. The optimal number of parallel tasks depends on the amount of memory on
# each node and the memory required by each of the tasks. To determine n, use
# the following as a general guideline:
#
#   n = (nr_nodes - 1) * 0.45 * nr_GB
#
# where nr_nodes is the number of nodes used and nr_GB is the amount of
# physical memory on each node.
#
# Note the following:
#
# * PARALLEL only affects the number of reduce tasks. Map parallelism is
#   determined by the input file, one map for each HDFS block.
# * If you don't specify PARALLEL, you still get the same map parallelism but
#   only one reduce task.
#
# == Usage
#
# Use the JOIN operator to join two or more relations based on common field
# values. The JOIN operator always performs an inner join.
#
# Note: The JOIN and COGROUP operators perform similar functions. JOIN creates
# a flat set of output records while COGROUP creates a nested set of output
# records.
#
# == Example
#
# Suppose we have relations A and B.
#
#   (A: a1, a2, a3)     (B: b1, b2)
#   -----------------   ---------------
#   (1, 2, 3)           (2, 4)
#   (4, 2, 1)           (8, 9)
#   (8, 3, 4)           (1, 3)
#   (4, 3, 3)           (2, 7)
#   (7, 2, 5)           (2, 9)
#   (8, 4, 3)           (4, 6)
#                       (4, 9)
#
# In this example relations A and B are joined on their first fields.
#
#   X = JOIN A BY a1, B BY b1;
#
# Relation X looks like this.
#
#   (1, 2, 3, 1, 3)
#   (4, 2, 1, 4, 6)
#   (4, 3, 3, 4, 6)
#   (4, 2, 1, 4, 9)
#   (4, 3, 3, 4, 9)
#   (8, 3, 4, 8, 9)
#   (8, 4, 3, 8, 9)
#
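# To see the flat-versus-nested contrast noted above (an illustrative sketch,
# not part of the original example), cogrouping the same relations keeps each
# relation's matching tuples in their own bag instead of flattening them:
#
#   C = COGROUP A BY a1, B BY b1;
#
# The tuple of C for key 8 would look like
# (8, {(8, 3, 4), (8, 4, 3)}, {(8, 9)}).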

# ---------------------------------------------------------------------------
#
# LIMIT
#
# Limits the number of output tuples.
#
# == Syntax
#
#   alias = LIMIT alias n;
#
# == Terms
#
# alias::
# The name of a relation.
#
# n::
# The number of tuples.
#
# == Usage
#
# Use the LIMIT operator to limit the number of output tuples (rows). If the
# specified number of output tuples is equal to or exceeds the number of
# tuples in the relation, the output will include all tuples in the relation.
#
# There is no guarantee which tuples will be returned, and the tuples that are
# returned can change from one run to the next. A particular set of tuples can
# be requested using the ORDER operator followed by LIMIT.
#
# Note: The LIMIT operator allows Pig to avoid processing all tuples in a
# relation. In most cases a query that uses LIMIT will run more efficiently
# than an identical query that does not use LIMIT. It is always a good idea to
# use LIMIT if you can.
#
# == Examples
#
# Suppose we have relation A.
#
#   (A: f1:int, f2:int, f3:int)
#   -----------------
#   (1, 2, 3)
#   (4, 2, 1)
#   (8, 3, 4)
#   (4, 3, 3)
#   (7, 2, 5)
#   (8, 4, 3)
#
# In this example output is limited to 3 tuples.
#
#   X = LIMIT A 3;
#
# Relation X could look like this (there is no guarantee which three tuples
# will be output).
#
#   (1, 2, 3)
#   (4, 3, 3)
#   (7, 2, 5)
#
# In this example the ORDER operator is used to order the tuples and the LIMIT
# operator is used to output the first three tuples.
#
#   B = ORDER A BY f1 DESC, f2 ASC;
#   X = LIMIT B 3;
#
# Relation B and relation X look like this.
#
#   (B)          (X)
#   -----------  -----------
#   (8, 3, 4)    (8, 3, 4)
#   (8, 4, 3)    (8, 4, 3)
#   (7, 2, 5)    (7, 2, 5)
#   (4, 2, 1)
#   (4, 3, 3)
#   (1, 2, 3)
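#
# When n meets or exceeds the relation's size, everything passes through; for
# instance (an illustrative case, not from the original examples), with the
# six-tuple relation A above,
#
#   X = LIMIT A 100;
#
# relation X contains all six tuples of A.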

# ---------------------------------------------------------------------------
#
# LOAD
#
# Loads data from the file system.
#
# == Syntax
#
#   LOAD 'data' [USING function] [AS schema];
#
# == Terms
#
# 'data'::
# The name of the file or directory, in single quotes.
#
# If you specify a directory name, all the files in the directory are loaded.
#
# You can use Hadoop-supported globbing to specify files at the file system or
# directory levels (see the Hadoop glob documentation for details on globbing
# syntax); a short globbing sketch follows the Examples below.
#
# USING::
# Keyword.
#
# function::
# The load function.
#
# PigStorage is the default load/store function and does not need to be
# specified. This function reads/writes simple newline-separated records with
# delimiter-separated fields. The function has one parameter, the field
# delimiter (tab ('\t') is the default delimiter).
#
# If the data is stored in a special format that the Pig load functions cannot
# parse, you can write your own load function.
#
# AS::
# Keyword.
#
# schema::
# A schema using the AS keyword, enclosed in parentheses (see Schemas).
#
# == Usage
#
# Use the LOAD operator to load data from the file system.
#
# == Examples
#
# Suppose we have a data file called myfile.txt. The fields are tab-delimited.
# The records are newline-separated.
#
#   1  2  3
#   4  2  1
#   8  3  4
#
# In this example the default load function, PigStorage, loads data from
# myfile.txt into relation A. Note that, because no schema is specified, the
# fields are not named and all fields default to type bytearray. The two
# statements are equivalent.
#
#   A = LOAD 'myfile.txt';
#   A = LOAD 'myfile.txt' USING PigStorage('\t');
#
# Relation A looks like this.
#
#   (1, 2, 3)
#   (4, 2, 1)
#   (8, 3, 4)
#
# In this example a schema is specified using the AS keyword. The two
# statements are equivalent.
#
#   A = LOAD 'myfile.txt' AS (f1:int, f2:int, f3:int);
#   A = LOAD 'myfile.txt' USING PigStorage('\t') AS (f1:int, f2:int, f3:int);
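#
# As the globbing sketch promised above (the paths here are hypothetical), a
# glob in the 'data' position loads every matching file in one statement:
#
#   A = LOAD '/logs/2008-12-*/part-*' AS (f1:int, f2:int, f3:int);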

# ---------------------------------------------------------------------------
#
# ORDER
#
# Sorts a relation based on one or more fields.
#
# == Syntax
#
#   alias = ORDER alias BY { * [ASC|DESC] | field_alias [ASC|DESC]
#           [, field_alias [ASC|DESC] ...] } [PARALLEL n];
#
# == Terms
#
# alias::
# The name of a relation.
#
# BY::
# Required keyword.
#
# *::
# Represents all fields in the relation.
#
# ASC::
# Sort in ascending order.
#
# DESC::
# Sort in descending order.
#
# field_alias::
# A field in the relation.
#
# PARALLEL n::
# Increase the parallelism of a job by specifying the number of reduce tasks,
# n. The optimal number of parallel tasks depends on the amount of memory on
# each node and the memory required by each of the tasks. To determine n, use
# the following as a general guideline:
#
#   n = (nr_nodes - 1) * 0.45 * nr_GB
#
# where nr_nodes is the number of nodes used and nr_GB is the amount of
# physical memory on each node.
#
# Note the following:
#
# * PARALLEL only affects the number of reduce tasks. Map parallelism is
#   determined by the input file, one map for each HDFS block.
# * If you don't specify PARALLEL, you still get the same map parallelism but
#   only one reduce task.
#
# == Usage
#
# In Pig, relations are logically unordered.
#
# * If you order relation A to produce relation X (X = ORDER A BY * DESC;),
#   relations A and X still contain the same data.
#
# * If you retrieve the contents of relation X, they are guaranteed to be in
#   the order you specified (descending).
#
# * However, if you further process relation X, there is no guarantee that the
#   contents will be processed in the order you specified.
#
# == Examples
#
# Suppose we have relation A.
#
#   (A: f1, f2, f3)
#   -----------------
#   (1, 2, 3)
#   (4, 2, 1)
#   (8, 3, 4)
#   (4, 3, 3)
#   (7, 2, 5)
#   (8, 4, 3)
#
# In this example relation A is sorted by the third field, f3, in descending
# order.
#
#   X = ORDER A BY f3 DESC;
#
# Relation X could look like this (note that the order of the three tuples
# ending in 3 can vary).
#
#   (7, 2, 5)
#   (8, 3, 4)
#   (1, 2, 3)
#   (4, 3, 3)
#   (8, 4, 3)
#   (4, 2, 1)
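#
# Multiple sort keys with mixed directions follow the same pattern (an
# illustrative variant of the example above):
#
#   X = ORDER A BY f1 ASC, f3 DESC;
#
# Ties on f1 are broken by comparing f3 in descending order.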

# ---------------------------------------------------------------------------
#
# SPLIT
#
# Partitions a relation into two or more relations.
#
# == Syntax
#
#   SPLIT alias INTO alias IF expression, alias IF expression [, alias IF expression ...];
#
# == Terms
#
# alias::
# The name of a relation.
#
# INTO::
# Required keyword.
#
# IF::
# Required keyword.
#
# expression::
# An expression.
#
# == Usage
#
# Use the SPLIT operator to partition a relation into two or more relations
# based on some expression. Depending on the expression:
#
# * A tuple may be assigned to more than one relation.
#
# * A tuple may not be assigned to any relation.
#
# == Example
#
# Suppose we have relation A.
#
#   (A: f1, f2, f3)
#   -----------------
#   (1, 2, 3)
#   (4, 5, 6)
#   (7, 8, 9)
#
# In this example relation A is split into three relations, X, Y, and Z.
#
#   SPLIT A INTO X IF f1 < 7, Y IF f2 == 5, Z IF (f3 < 6 OR f3 > 6);
#
# Relations X, Y, and Z look like this.
#
#   (X)          (Y)          (Z)
#   ----------   ----------   ----------
#   (1, 2, 3)    (4, 5, 6)    (1, 2, 3)
#   (4, 5, 6)                 (7, 8, 9)
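#
# Note that (4, 5, 6) is assigned to both X and Y, illustrating the first
# bullet above. The second bullet can be seen with a different set of
# conditions (an illustrative variant, not from the original example):
#
#   SPLIT A INTO X IF f1 > 6, Y IF f3 < 4;
#
# Here (7, 8, 9) goes to X, (1, 2, 3) goes to Y, and (4, 5, 6) satisfies
# neither condition, so it is assigned to no relation.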

# ---------------------------------------------------------------------------
#
# STORE
#
# Stores data to the file system.
#
# == Syntax
#
#   STORE alias INTO 'directory' [USING function];
#
# == Terms
#
# alias::
# The name of a relation.
#
# INTO::
# Required keyword.
#
# 'directory'::
# The name of the storage directory, in quotes. If the directory already
# exists, the STORE operation will fail.
#
# The output data files, named part-nnnnn, are written to this directory.
#
# USING::
# Keyword. Use this clause to name the store function.
#
# function::
# The store function.
#
# PigStorage is the default load/store function and does not need to be
# specified. This function reads/writes simple newline-separated records with
# delimiter-separated fields. The function has one parameter, the field
# delimiter (tab ('\t') is the default delimiter).
#
# If you want to store the data in a special format that the Pig Load/Store
# functions cannot handle, you can write your own store function.
#
# == Usage
#
# Use the STORE operator to store data on the file system.
#
# == Example
#
# Suppose we have relation A.
#
#   (A)
#   ----------------
#   (1, 2, 3)
#   (4, 2, 1)
#   (8, 3, 4)
#   (4, 3, 3)
#   (7, 2, 5)
#   (8, 4, 3)
#
# In this example the contents of relation A are written to file part-00000
# located in directory myoutput.
#
#   STORE A INTO 'myoutput' USING PigStorage('*');
#
# The part-00000 file looks like this. Fields are delimited with the asterisk
# (*) character and records are separated by newlines.
#
#   1*2*3
#   4*2*1
#   8*3*4
#   4*3*3
#   7*2*5
#   8*4*3
#
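# With no USING clause (an illustrative variant of the statement above, with a
# hypothetical output directory), the default PigStorage function writes
# tab-delimited fields instead:
#
#   STORE A INTO 'myoutput2';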

# ---------------------------------------------------------------------------
#
# STREAM
#
# Sends data to an external script or program.
#
# == Syntax
#
#   alias = STREAM alias [, alias ...] THROUGH {`command` | cmd_alias} [AS schema];
#
# == Terms
#
# alias::
# The name of a relation.
#
# THROUGH::
# Keyword.
#
# `command`::
# A command, including the arguments, enclosed in backticks (where a command
# is anything that can be executed).
#
# cmd_alias::
# The name of a command created using the DEFINE operator.
#
# AS::
# Keyword.
#
# schema::
# A schema using the AS keyword, enclosed in parentheses (see Schemas).
#
# == Usage
#
# Use the STREAM operator to send data through an external script or program.
# Multiple stream operators can appear in the same Pig script. The stream
# operators can be adjacent to each other or have other operations in between.
#
# When used with a command, a stream statement could look like this:
#
#   A = LOAD 'data';
#   B = STREAM A THROUGH `stream.pl -n 5`;
#
# When used with a cmd_alias, a stream statement could look like this, where
# cmd is the defined alias.
#
#   A = LOAD 'data';
#   DEFINE cmd `stream.pl -n 5`;
#   B = STREAM A THROUGH cmd;
#
# == About Data Guarantees
#
# Data guarantees are determined based on the position of the streaming
# operator in the Pig script.
#
# * Unordered data - No guarantee for the order in which the data is delivered
#   to the streaming application.
#
# * Grouped data - The data for the same grouped key is guaranteed to be
#   provided to the streaming application contiguously.
#
# * Grouped and ordered data - The data for the same grouped key is guaranteed
#   to be provided to the streaming application contiguously. Additionally,
#   the data within the group is guaranteed to be sorted by the provided
#   secondary key.
#
# In addition to position, data grouping and ordering can be determined by the
# data itself. However, you need to know the properties of the data to be able
# to take advantage of its structure.
#
# == Example: Data Guarantees
#
# In this example the data is unordered.
#
#   A = LOAD 'data';
#   B = STREAM A THROUGH `stream.pl`;
#
# In this example the data is grouped.
#
#   A = LOAD 'data';
#   B = GROUP A BY $1;
#   C = FOREACH B GENERATE FLATTEN(A);
#   D = STREAM C THROUGH `stream.pl`;
#
# In this example the data is grouped and ordered.
#
#   A = LOAD 'data';
#   B = GROUP A BY $1;
#   C = FOREACH B {
#     D = ORDER A BY ($3, $4);
#     GENERATE D;
#   }
#   E = STREAM C THROUGH `stream.pl`;
#
# == Example: Schemas
#
# In this example a schema is specified as part of the STREAM statement.
#
#   X = STREAM A THROUGH `stream.pl` AS (f1:int, f2:int, f3:int);
#
# == Additional Examples
#
# See DEFINE for additional examples.

# ---------------------------------------------------------------------------
#
# UNION
#
# Computes the union of two or more relations.
#
# == Syntax
#
#   alias = UNION alias, alias [, alias ...];
#
# == Terms
#
# alias::
# The name of a relation.
#
# == Usage
#
# Use the UNION operator to compute the union of two or more relations. The
# UNION operator:
#
# * Does not preserve the order of tuples. Both the input and output relations
#   are interpreted as unordered bags of tuples.
#
# * Does not ensure (as databases do) that all tuples adhere to the same
#   schema or that they have the same number of fields. In a typical scenario,
#   however, this should be the case; therefore, it is the user's
#   responsibility to either (1) ensure that the tuples in the input relations
#   have the same schema or (2) be able to process varying tuples in the
#   output relation.
#
# * Does not eliminate duplicate tuples.
#
# == Example
#
# Suppose we have relations A and B.
#
#   (A)          (B)
#   -----------  --------
#   (1, 2, 3)    (2, 4)
#   (4, 2, 1)    (8, 9)
#                (1, 3)
#
# In this example the union of relation A and B is computed.
#
#   X = UNION A, B;
#
# Relation X looks like this.
#
#   (1, 2, 3)
#   (4, 2, 1)
#   (2, 4)
#   (8, 9)
#   (1, 3)
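#
# Because UNION keeps duplicates, a DISTINCT step can follow when unique
# tuples are wanted (an illustrative follow-up, not part of the original
# example):
#
#   Y = DISTINCT X;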
#
# ---------------------------------------------------------------------------
#
# Diagnostic Operators
#
# DESCRIBE
#
# Returns the schema of an alias.
#
# == Syntax
#
#   DESCRIBE alias;
#
# == Terms
#
# alias::
# The name of a relation.
#
# == Usage
#
# Use the DESCRIBE operator to review the schema of a particular alias.
#
# == Example
#
# In this example a schema is specified using the AS clause.
#
#   A = LOAD 'students' AS (name:chararray, age:int, gpa:float);
#   B = FILTER A BY name matches 'John%';
#   C = GROUP B BY name;
#   D = FOREACH C GENERATE COUNT(B.age);
#   DESCRIBE A;
#   A: {name: chararray,age: int,gpa: float}
#   DESCRIBE B;
#   B: {name: chararray,age: int,gpa: float}
#   DESCRIBE C;
#   C: {group: chararray,B: {name: chararray,age: int,gpa: float}}
#   DESCRIBE D;
#   D: {long}
#
# In this example no schema is specified. All data items default to type
# bytearray.
#
#   grunt> a = LOAD '/data/students';
#   grunt> b = FILTER a BY $0 matches 'John%';
#   grunt> c = GROUP b BY $0;
#   grunt> d = FOREACH c GENERATE COUNT(b.$1);
#   grunt> DESCRIBE a;
#
#   Schema for a unknown.
#
#   grunt> DESCRIBE b;
#   2008-12-05 01:17:15,316 [main] WARN org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
#
#   Schema for b unknown.
#
#   grunt> DESCRIBE c;
#   2008-12-05 01:17:23,343 [main] WARN org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
#
#   c: {group: bytearray,b: {null}}
#
#   grunt> DESCRIBE d;
#   2008-12-05 03:04:30,076 [main] WARN org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
#
#   d: {long}
#
# ---------------------------------------------------------------------------
#
# DUMP
#
# Displays the contents of an alias.
#
# == Syntax
#
#   DUMP alias;
#
# == Terms
#
# alias::
# The name of a relation.
#
# == Usage
#
# Use the DUMP operator to display the contents of an alias. You can use DUMP
# as a debugging device to make sure the correct results are being generated.
#
# == Example
#
# In this example a dump is performed after each statement.
#
#   A = LOAD 'students' AS (name:chararray, age:int, gpa:float);
#   DUMP A;
#   B = FILTER A BY name matches 'John%';
#   DUMP B;
#   C = GROUP B BY name;
#   DUMP C;
#   D = FOREACH C GENERATE COUNT(B.age);
#   DUMP D;
#
# ---------------------------------------------------------------------------
#
# EXPLAIN
#
# Displays execution plans.
#
# == Syntax
#
#   EXPLAIN alias;
#
# == Terms
#
# alias::
# The name of a relation.
#
# == Usage
#
# Use the EXPLAIN operator to review the logical, physical, and map reduce
# execution plans that are used to compute the specified relationship.
#
# * The logical plan shows a pipeline of operators to be executed to build the
#   relation. Type checking and backend-independent optimizations (such as
#   applying filters early on) also apply.
#
# * The physical plan shows how the logical operators are translated to
#   backend-specific physical operators. Some backend optimizations also
#   apply.
#
# * The map reduce plan shows how the physical operators are grouped into map
#   reduce jobs.
#
# == Example
#
# In this example the EXPLAIN operator produces all three plans. (Note that
# only a portion of the output is shown in this example.)
#
#   A = LOAD 'students' AS (name:chararray, age:int, gpa:float);
#   B = GROUP A BY name;
#   C = FOREACH B GENERATE COUNT(A.age);
#   EXPLAIN C;
#
#   Logical Plan:
#
#   Store xxx-Fri Dec 05 19:42:29 UTC 2008-23 Schema: {long} Type: Unknown
#   |
#   |---ForEach xxx-Fri Dec 05 19:42:29 UTC 2008-15 Schema: {long} Type: bag
#   etc ...
#
#   -----------------------------------------------
#   Physical Plan:
#   -----------------------------------------------
#   Store(fakefile:org.apache.pig.builtin.PigStorage) - xxx-Fri Dec 05 19:42:29 UTC 2008-40
#   |
#   |---New For Each(false)[bag] - xxx-Fri Dec 05 19:42:29 UTC 2008-39
#   |   |
#   |   POUserFunc(org.apache.pig.builtin.COUNT)[long] - xxx-Fri Dec 05
#   etc ...
#
#   --------------------------------------------------
#   | Map Reduce Plan                                |
#   --------------------------------------------------
#   MapReduce node xxx-Fri Dec 05 19:42:29 UTC 2008-41
#   Map Plan
#   Local Rearrange[tuple]{chararray}(false) - xxx-Fri Dec 05 19:42:29 UTC 2008-34
#   |   |
#   |   Project[chararray][0] - xxx-Fri Dec 05 19:42:29 UTC 2008-35
#   etc ...
#
# ---------------------------------------------------------------------------
#
# ILLUSTRATE
#
# Displays a step-by-step execution of a sequence of statements.
#
# == Syntax
#
#   ILLUSTRATE alias;
#
# == Terms
#
# alias::
# The name of a relation.
#
# == Usage
#
# Use the ILLUSTRATE operator to review how data items are transformed through
# a sequence of Pig Latin statements.
#
# ILLUSTRATE accesses the ExampleGenerator algorithm, which can select an
# appropriate and concise set of example data items automatically. It does a
# better job than random sampling would do; for example, random sampling
# suffers from the drawback that selective operations such as filters or joins
# can eliminate all the sampled data items, giving you empty results, which is
# of no help with debugging.
#
# With the ILLUSTRATE operator you can test your programs on small datasets
# and get faster turnaround times. The ExampleGenerator algorithm uses Pig's
# Local mode (rather than Hadoop mode), which means that illustrative example
# data is generated in near real-time.
#
# == Example
#
# Suppose we have a data file called 'visits.txt'.
#
#   Amy   cnn.com        20080218
#   Fred  harvard.edu    20081204
#   Amy   bbc.com        20081205
#   Fred  stanford.edu   20081206
#
# In this example we count the number of sites a user has visited since
# 12/1/08. The ILLUSTRATE statement will show how the results for
# num_user_visits are derived.
#
#   visits = LOAD 'visits.txt' AS (user:chararray, url:chararray, timestamp:chararray);
#   recent_visits = FILTER visits BY timestamp >= '20081201';
#   user_visits = GROUP recent_visits BY user;
#   num_user_visits = FOREACH user_visits GENERATE COUNT(recent_visits);
#   ILLUSTRATE num_user_visits;
#
# The output from the ILLUSTRATE statement looks like this.
#
#   ------------------------------------------------------------------------
#   | visits   | user: bytearray | url: bytearray | timestamp: bytearray   |
#   ------------------------------------------------------------------------
#   |          | Amy             | cnn.com        | 20080218               |
#   |          | Fred            | harvard.edu    | 20081204               |
#   |          | Amy             | bbc.com        | 20081205               |
#   |          | Fred            | stanford.edu   | 20081206               |
#   ------------------------------------------------------------------------
#
#   -------------------------------------------------------------------------------
#   | recent_visits   | user: chararray | url: chararray | timestamp: chararray   |
#   -------------------------------------------------------------------------------
#   |                 | Fred            | harvard.edu    | 20081204               |
#   |                 | Amy             | bbc.com        | 20081205               |
#   |                 | Fred            | stanford.edu   | 20081206               |
#   -------------------------------------------------------------------------------
#
#   ------------------------------------------------------------------------------------------------------------------
#   | user_visits | group: chararray | recent_visits: bag({user: chararray,url: chararray,timestamp: chararray})     |
#   ------------------------------------------------------------------------------------------------------------------
#   |             | Amy              | {(Amy, bbc.com, 20081205)}                                                    |
#   |             | Fred             | {(Fred, harvard.edu, 20081204), (Fred, stanford.edu, 20081206)}              |
#   ------------------------------------------------------------------------------------------------------------------
#
#   -------------------------------
#   | num_user_visits | long      |
#   -------------------------------
#   |                 | 1         |
#   |                 | 2         |
#   -------------------------------
#

# ---------------------------------------------------------------------------
#
# DEFINE
#
# Assigns an alias to a function or command.
#
# == Syntax
#
#   DEFINE alias {function | [`command` [input] [output] [ship] [cache]] };
#
# == Terms
#
# alias::
# The name for the function or command.
#
# function::
# The name of a function.
#
# Use this option to define functions for use with the FOREACH and FILTER
# operators.
#
# `command`::
# A command, including the arguments, enclosed in backticks (where a command
# is anything that can be executed).
#
# Use this option to define commands for use with the STREAM operator.
#
# input::
#   INPUT ( {stdin | 'path'} [USING serializer] [, {stdin | 'path'} [USING serializer] ...] )
#
# Where:
#
# * INPUT - Keyword.
# * 'path' - A file path, enclosed in single quotes.
# * USING - Keyword.
# * serializer - A function that converts data from tuples to stream format.
#   PigStorage is the default serializer. You can also write your own UDF.
#
# output::
#   OUTPUT ( {stdout | stderr | 'path'} [USING deserializer] [, {stdout | stderr | 'path'} [USING deserializer] ...] )
#
# Where:
#
# * OUTPUT - Keyword.
# * 'path' - A file path, enclosed in single quotes.
# * USING - Keyword.
# * deserializer - A function that converts data from stream format to tuples.
#   PigStorage is the default deserializer. You can also write your own UDF.
#
# ship::
#   SHIP('path' [, 'path' ...])
#
# Where:
#
# * SHIP - Keyword.
# * 'path' - A file path, enclosed in single quotes.
#
# cache::
#   CACHE('dfs_path#dfs_file' [, 'dfs_path#dfs_file' ...])
#
# Where:
#
# * CACHE - Keyword.
# * 'dfs_path#dfs_file' - A file path/file name on the distributed file
#   system, enclosed in single quotes. Example: '/mydir/mydata.txt#mydata.txt'
#
# == Usage
#
# Use the DEFINE statement to assign a name (alias) to a function or to a
# command.
#
# Use DEFINE to specify a function when:
#
# * The function has a long package name that you don't want to include in a
#   script, especially if you call the function several times in that script.
#
# * The constructor for the function takes parameters (see the first example
#   below). If you need to use different constructor parameters for different
#   calls to the function you will need to create multiple defines - one for
#   each parameter set.
#
# Use DEFINE to specify a command when the streaming command specification is
# complex or requires additional parameters (input, output, and so on).
#
# === About Input and Output
#
# Serialization is needed to convert data from tuples to a format that can be
# processed by the streaming application. Deserialization is needed to convert
# the output from the streaming application back into tuples.
#
# PigStorage, the default serialization/deserialization function, converts
# tuples to tab-delimited lines. Pig's BinarySerializer and BinaryDeserializer
# functions treat the entire file as a byte stream (no formatting or
# interpretation takes place). You can also write your own
# serialization/deserialization functions.
#
# === About Ship
#
# Use the ship option to send streaming binary and supporting files, if any,
# from the client node to the compute nodes. Pig does not automatically ship
# dependencies; it is your responsibility to explicitly specify all the
# dependencies and to make sure that the software the processing relies on
# (for instance, perl or python) is installed on the cluster. Supporting files
# are shipped to the task's current working directory and only relative paths
# should be specified. Any pre-installed binaries should be specified in the
# path.
#
# Only files, not directories, can be specified with the ship option. One way
# to work around this limitation is to tar all the dependencies into a tar
# file that accurately reflects the structure needed on the compute nodes,
# then have a wrapper for your script that un-tars the dependencies prior to
# execution.
#
# Note that the ship option has two components: the source specification,
# provided in the ship clause, is the view of your machine; the command
# specification is the view of the cluster. The only guarantee is that the
# shipped files are available in the current working directory of the launched
# job and that your current working directory is also on the PATH environment
# variable.
#
# Shipping files to relative paths or absolute paths is not supported since
# you might not have permission to read/write/execute from arbitrary paths on
# the clusters.
#
# === About Cache
#
# The ship option works with binaries, jars, and small datasets. However,
# loading larger datasets at run time for every execution can severely impact
# performance. Instead, use the cache option to access large files already
# moved to and available on the compute nodes. Only files, not directories,
# can be specified with the cache option.
#
# == Example: Input/Output
#
# In this example PigStorage is the default serialization/deserialization
# function. The tuples from relation A are converted to tab-delimited lines
# that are passed to the script.
#
#   X = STREAM A THROUGH `stream.pl`;
#
# In this example PigStorage is used as the serialization/deserialization
# function, but a comma is used as the delimiter.
#
#   DEFINE Y `stream.pl` INPUT(stdin USING PigStorage(',')) OUTPUT(stdout USING PigStorage(','));
#   X = STREAM A THROUGH Y;
#
# In this example user-defined serialization/deserialization functions are
# used with the script.
#
#   DEFINE Y `stream.pl` INPUT(stdin USING MySerializer) OUTPUT(stdout USING MyDeserializer);
#   X = STREAM A THROUGH Y;
#
# == Example: Ship/Cache
#
# In this example ship is used to send the script to the cluster compute
# nodes.
#
#   DEFINE Y `stream.pl` SHIP('/work/stream.pl');
#   X = STREAM A THROUGH Y;
#
# In this example cache is used to specify a file located on the cluster
# compute nodes.
#
#   DEFINE Y `stream.pl data.gz` SHIP('/work/stream.pl') CACHE('/input/data.gz#data.gz');
#   X = STREAM A THROUGH Y;
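#
# The clauses compose: one DEFINE can combine serialization and shipping (an
# illustrative sketch with hypothetical paths, not from the original
# examples):
#
#   DEFINE Y `stream.pl` INPUT(stdin USING PigStorage(','))
#                        OUTPUT(stdout USING PigStorage(','))
#                        SHIP('/work/stream.pl');
#   X = STREAM A THROUGH Y;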
#
# == Example: Logging
#
# In this example the streaming stderr is stored in the _logs/<dir> directory
# of the job's output directory. Because the job can have multiple streaming
# applications associated with it, you need to ensure that different directory
# names are used to avoid conflicts. Pig stores up to 100 tasks per streaming
# job.
#
#   DEFINE Y `stream.pl` stderr('<dir>' limit 100);
#   X = STREAM A THROUGH Y;
#
# In this example a function is defined for use with the FOREACH ... GENERATE
# operator.
#
#   grunt> REGISTER /src/myfunc.jar;
#   grunt> DEFINE myFunc myfunc.MyEvalfunc('foo');
#   grunt> A = LOAD 'students';
#   grunt> B = FOREACH A GENERATE myFunc($0);
#
# In this example a command is defined for use with the STREAM operator.
#
#   grunt> A = LOAD 'data';
#   grunt> DEFINE cmd `stream_cmd -input file.dat`;
#   grunt> B = STREAM A THROUGH cmd;
#

# ---------------------------------------------------------------------------
#
# REGISTER
#
# Registers a JAR file so that the UDFs in the file can be used.
#
# == Syntax
#
#   REGISTER alias;
#
# == Terms
#
# alias::
# The path of a Java JAR file. Do not place the name in quotes.
#
# == Usage
#
# Use the REGISTER statement to specify the path of a Java JAR file containing
# UDFs.
#
# For more information about UDFs, see the User Defined Function Guide. Note
# that Pig currently only supports functions written in Java.
#
# == Example
#
# In this example REGISTER registers myfunc.jar, located in the /src
# directory, so that its UDFs can be called.
#
#   grunt> REGISTER /src/myfunc.jar;
#   grunt> A = LOAD 'students';
#   grunt> B = FOREACH A GENERATE myfunc.MyEvalFunc($0);
#