wukong 3.0.0.pre → 3.0.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,1103 +0,0 @@
1
- <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
- <html>
3
- <head>
4
- <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
5
- <meta content="Apache Forrest" name="Generator">
6
- <meta name="Forrest-version" content="0.8">
7
- <meta name="Forrest-skin-name" content="pelt">
8
- <title>Pig Latin Reference Manual 1</title>
9
- <link type="text/css" href="skin/basic.css" rel="stylesheet">
10
- <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
11
- <link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
12
- <link type="text/css" href="skin/profile.css" rel="stylesheet">
13
- <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
14
- <link rel="shortcut icon" href="">
15
- </head>
16
- <body onload="init()">
17
- <script type="text/javascript">ndeSetTextSize();</script>
18
- <div id="top">
19
- <!--+
20
- |breadtrail
21
- +-->
22
- <div class="breadtrail">
23
- <a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/pig/">Pig</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
24
- </div>
25
- <!--+
26
- |header
27
- +-->
28
- <div class="header">
29
- <!--+
30
- |start group logo
31
- +-->
32
- <div class="grouplogo">
33
- <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
34
- </div>
35
- <!--+
36
- |end group logo
37
- +-->
38
- <!--+
39
- |start Project Logo
40
- +-->
41
- <div class="projectlogo">
42
- <a href="http://hadoop.apache.org/pig/"><img class="logoImage" alt="Pig" src="images/pig-logo.gif" title="A platform for analyzing large datasets."></a>
43
- </div>
44
- <!--+
45
- |end Project Logo
46
- +-->
47
- <!--+
48
- |start Search
49
- +-->
50
- <div class="searchbox">
51
- <form action="http://www.google.com/search" method="get" class="roundtopsmall">
52
- <input value="" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
53
- <input name="Search" value="Search" type="submit">
54
- </form>
55
- </div>
56
- <!--+
57
- |end search
58
- +-->
59
- <!--+
60
- |start Tabs
61
- +-->
62
- <ul id="tabs">
63
- <li>
64
- <a class="unselected" href="http://hadoop.apache.org/pig/">Project</a>
65
- </li>
66
- <li>
67
- <a class="unselected" href="http://wiki.apache.org/pig/">Wiki</a>
68
- </li>
69
- <li class="current">
70
- <a class="selected" href="index.html">Pig 0.7.0 Documentation</a>
71
- </li>
72
- </ul>
73
- <!--+
74
- |end Tabs
75
- +-->
76
- </div>
77
- </div>
78
- <div id="main">
79
- <div id="publishedStrip">
80
- <!--+
81
- |start Subtabs
82
- +-->
83
- <div id="level2tabs"></div>
84
- <!--+
85
- |end Endtabs
86
- +-->
87
- <script type="text/javascript"><!--
88
- document.write("Last Published: " + document.lastModified);
89
- // --></script>
90
- </div>
91
- <!--+
92
- |breadtrail
93
- +-->
94
- <div class="breadtrail">
95
-
96
- &nbsp;
97
- </div>
98
- <!--+
99
- |start Menu, mainarea
100
- +-->
101
- <!--+
102
- |start Menu
103
- +-->
104
- <div id="menu">
105
- <div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Pig</div>
106
- <div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
107
- <div class="menuitem">
108
- <a href="index.html">Overview</a>
109
- </div>
110
- <div class="menuitem">
111
- <a href="setup.html">Setup</a>
112
- </div>
113
- <div class="menuitem">
114
- <a href="tutorial.html">Tutorial</a>
115
- </div>
116
- <div class="menupage">
117
- <div class="menupagetitle">Pig Latin 1</div>
118
- </div>
119
- <div class="menuitem">
120
- <a href="piglatin_ref2.html">Pig Latin 2</a>
121
- </div>
122
- <div class="menuitem">
123
- <a href="cookbook.html">Cookbook</a>
124
- </div>
125
- <div class="menuitem">
126
- <a href="udf.html">UDFs</a>
127
- </div>
128
- </div>
129
- <div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Zebra</div>
130
- <div id="menu_1.2" class="menuitemgroup">
131
- <div class="menuitem">
132
- <a href="zebra_overview.html">Zebra Overview </a>
133
- </div>
134
- <div class="menuitem">
135
- <a href="zebra_users.html">Zebra Users </a>
136
- </div>
137
- <div class="menuitem">
138
- <a href="zebra_reference.html">Zebra Reference </a>
139
- </div>
140
- <div class="menuitem">
141
- <a href="zebra_mapreduce.html">Zebra MapReduce </a>
142
- </div>
143
- <div class="menuitem">
144
- <a href="zebra_pig.html">Zebra Pig </a>
145
- </div>
146
- <div class="menuitem">
147
- <a href="zebra_stream.html">Zebra Streaming </a>
148
- </div>
149
- </div>
150
- <div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Miscellaneous</div>
151
- <div id="menu_1.3" class="menuitemgroup">
152
- <div class="menuitem">
153
- <a href="api/">API Docs</a>
154
- </div>
155
- <div class="menuitem">
156
- <a href="http://wiki.apache.org/pig/">Wiki</a>
157
- </div>
158
- <div class="menuitem">
159
- <a href="http://wiki.apache.org/pig/FAQ">FAQ</a>
160
- </div>
161
- <div class="menuitem">
162
- <a href="http://hadoop.apache.org/pig/releases.html">Release Notes</a>
163
- </div>
164
- </div>
165
- <div id="credit"></div>
166
- <div id="roundbottom">
167
- <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
168
- <!--+
169
- |alternative credits
170
- +-->
171
- <div id="credit2"></div>
172
- </div>
173
- <!--+
174
- |end Menu
175
- +-->
176
- <!--+
177
- |start content
178
- +-->
179
- <div id="content">
180
- <div title="Portable Document Format" class="pdflink">
181
- <a class="dida" href="piglatin_ref1.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
182
- PDF</a>
183
- </div>
184
- <h1>Pig Latin Reference Manual 1</h1>
185
- <div id="minitoc-area">
186
- <ul class="minitoc">
187
- <li>
188
- <a href="#Overview">Overview</a>
189
- </li>
190
- <li>
191
- <a href="#Pig+Latin+Statements">Pig Latin Statements</a>
192
- <ul class="minitoc">
193
- <li>
194
- <a href="#Running+Pig+Latin">Running Pig Latin </a>
195
- </li>
196
- <li>
197
- <a href="#Retrieving+Pig+Latin+Results">Retrieving Pig Latin Results</a>
198
- </li>
199
- <li>
200
- <a href="#Debugging+Pig+Latin">Debugging Pig Latin</a>
201
- </li>
202
- <li>
203
- <a href="#Working+with+Data">Working with Data</a>
204
- </li>
205
- <li>
206
- <a href="#Using+Comments+in+Scripts">Using Comments in Scripts</a>
207
- </li>
208
- <li>
209
- <a href="#Case+Sensitivity">Case Sensitivity</a>
210
- </li>
211
- </ul>
212
- </li>
213
- <li>
214
- <a href="#Multi-Query+Execution">Multi-Query Execution</a>
215
- <ul class="minitoc">
216
- <li>
217
- <a href="#Turning+it+On+or+Off">Turning it On or Off</a>
218
- </li>
219
- <li>
220
- <a href="#How+it+Works">How it Works</a>
221
- </li>
222
- <li>
223
- <a href="#Store+vs.+Dump">Store vs. Dump</a>
224
- </li>
225
- <li>
226
- <a href="#Error+Handling">Error Handling</a>
227
- </li>
228
- <li>
229
- <a href="#Backward+Compatibility">Backward Compatibility</a>
230
- </li>
231
- <li>
232
- <a href="#Implicit+Dependencies">Implicit Dependencies</a>
233
- </li>
234
- </ul>
235
- </li>
236
- <li>
237
- <a href="#Specialized+Joins">Specialized Joins</a>
238
- <ul class="minitoc">
239
- <li>
240
- <a href="#Replicated+Joins">Replicated Joins</a>
241
- </li>
242
- <li>
243
- <a href="#Skewed+Joins">Skewed Joins</a>
244
- </li>
245
- <li>
246
- <a href="#Merge+Joins">Merge Joins</a>
247
- </li>
248
- </ul>
249
- </li>
250
- <li>
251
- <a href="#Optimization+Rules">Optimization Rules</a>
252
- <ul class="minitoc">
253
- <li>
254
- <a href="#ImplicitSplitInserter">ImplicitSplitInserter</a>
255
- </li>
256
- <li>
257
- <a href="#TypeCastInserter">TypeCastInserter</a>
258
- </li>
259
- <li>
260
- <a href="#StreamOptimizer">StreamOptimizer</a>
261
- </li>
262
- <li>
263
- <a href="#OpLimitOptimizer">OpLimitOptimizer</a>
264
- </li>
265
- <li>
266
- <a href="#PushUpFilters">PushUpFilters</a>
267
- </li>
268
- <li>
269
- <a href="#PushDownExplodes">PushDownExplodes</a>
270
- </li>
271
- </ul>
272
- </li>
273
- <li>
274
- <a href="#Memory+Management">Memory Management</a>
275
- </li>
276
- <li>
277
- <a href="#Zebra+Integration">Zebra Integration</a>
278
- </li>
279
- </ul>
280
- </div>
281
-
282
- <!-- ABOUT PIG LATIN -->
283
-
284
- <a name="N10010"></a><a name="Overview"></a>
285
- <h2 class="h3">Overview</h2>
286
- <div class="section">
287
- <p>Use this manual together with <a href="piglatin_ref2.html">Pig Latin Reference Manual 2</a>. </p>
288
- <p>Also, be sure to review the information in the <a href="cookbook.html">Pig Cookbook</a>. </p>
289
- </div>
290
-
291
- <!-- PIG LATIN STATEMENTS -->
292
-
293
- <a name="N10027"></a><a name="Pig+Latin+Statements"></a>
294
- <h2 class="h3">Pig Latin Statements</h2>
295
- <div class="section">
296
- <p>A Pig Latin statement is an operator that takes a <a href="piglatin_ref2.html#Relations%2C+Bags%2C+Tuples%2C+Fields">relation</a>
297
- as input and produces another relation as output.
298
- (This definition applies to all Pig Latin operators except LOAD and STORE which read data from and write data to the file system.)
299
- Pig Latin statements can span multiple lines and must end with a semi-colon ( ; ).
300
- Pig Latin statements are generally organized in the following manner: </p>
301
- <ol>
302
-
303
- <li>
304
-
305
- <p>A LOAD statement reads data from the file system. </p>
306
-
307
- </li>
308
-
309
- <li>
310
-
311
- <p>A series of "transformation" statements process the data. </p>
312
-
313
- </li>
314
-
315
- <li>
316
-
317
- <p>A STORE statement writes output to the file system; or, a DUMP statement displays output to the screen.</p>
318
-
319
- </li>
320
-
321
- </ol>
322
- <a name="N10049"></a><a name="Running+Pig+Latin"></a>
323
- <h3 class="h4">Running Pig Latin </h3>
324
- <p>You can execute Pig Latin statements: </p>
325
- <ul>
326
-
327
- <li>Using grunt shell or command line</li>
328
-
329
- <li>In mapreduce mode or local mode</li>
330
-
331
- <li>Either interactively or in batch </li>
332
-
333
- </ul>
334
- <p></p>
335
- <p>Note that Pig now uses Hadoop's local mode (rather than Pig's native local mode).</p>
336
- <p>A few run examples are shown here; see <a href="setup.html">Pig Setup</a> for more examples.</p>
337
- <p>Grunt Shell - interactive, mapreduce mode (because mapreduce mode is the default you do not need to specify)</p>
338
- <pre class="code">
339
- $ pig
340
- ... - Connecting to ...
341
- grunt&gt; A = load 'data';
342
- grunt&gt; B = ... ;
343
- </pre>
344
- <p>Grunt Shell - batch, local mode (see the <a href="piglatin_ref2.html#exec">exec</a> and <a href="piglatin_ref2.html#run">run</a> commands)</p>
345
- <pre class="code">
346
- $ pig -x local
347
- grunt&gt; exec myscript.pig;
348
- or
349
- grunt&gt; run myscript.pig;
350
- </pre>
351
- <p>Command Line - batch, mapreduce mode</p>
352
- <pre class="code">
353
- $ pig myscript.pig
354
- </pre>
355
- <p>Command Line - batch, local mode</p>
356
- <pre class="code">
357
- $ pig -x local myscript.pig
358
- </pre>
359
- <p></p>
360
- <p>
361
- <em>In general</em>, Pig processes Pig Latin statements as follows:</p>
362
- <ol>
363
-
364
- <li>
365
-
366
- <p>First, Pig validates the syntax and semantics of all statements.</p>
367
-
368
- </li>
369
-
370
- <li>
371
-
372
- <p>Next, if Pig encounters a DUMP or STORE, Pig will execute the statements.</p>
373
-
374
- </li>
375
-
376
- </ol>
377
- <p></p>
378
- <p>In this example Pig will validate, but not execute, the LOAD and FOREACH statements.</p>
379
- <pre class="code">
380
- A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float);
381
- B = FOREACH A GENERATE name;
382
- </pre>
383
- <p>In this example, Pig will validate and then execute the LOAD, FOREACH, and DUMP statements.</p>
384
- <pre class="code">
385
- A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float);
386
- B = FOREACH A GENERATE name;
387
- DUMP B;
388
- (John)
389
- (Mary)
390
- (Bill)
391
- (Joe)
392
- </pre>
393
- <p>
394
- </p>
395
- <p>See <a href="#Multi-Query+Execution">Multi-Query Execution</a> for more information on how Pig Latin statements are processed.</p>
396
- <a name="N100BF"></a><a name="Retrieving+Pig+Latin+Results"></a>
397
- <h3 class="h4">Retrieving Pig Latin Results</h3>
398
- <p>Pig Latin includes operators you can use to retrieve the results of your Pig Latin statements: </p>
399
- <ol>
400
-
401
- <li>
402
-
403
- <p>Use the DUMP operator to display results to a screen. </p>
404
-
405
- </li>
406
-
407
- <li>
408
-
409
- <p>Use the STORE operator to write results to a file on the file system.</p>
410
-
411
- </li>
412
-
413
- </ol>
414
- <a name="N100D8"></a><a name="Debugging+Pig+Latin"></a>
415
- <h3 class="h4">Debugging Pig Latin</h3>
416
- <p>Pig Latin includes operators that can help you debug your Pig Latin statements:</p>
417
- <ol>
418
-
419
- <li>
420
-
421
- <p>Use the DESCRIBE operator to review the schema of a relation.</p>
422
-
423
- </li>
424
-
425
- <li>
426
-
427
- <p>Use the EXPLAIN operator to view the logical, physical, or map reduce execution plans to compute a relation.</p>
428
-
429
- </li>
430
-
431
- <li>
432
-
433
- <p>Use the ILLUSTRATE operator to view the step-by-step execution of a series of statements.</p>
434
-
435
- </li>
436
-
437
- </ol>
438
- <a name="N100F7"></a><a name="Working+with+Data"></a>
439
- <h3 class="h4">Working with Data</h3>
440
- <p>Pig Latin allows you to work with data in many ways. In general, and as a starting point:</p>
441
- <ol>
442
-
443
- <li>
444
-
445
- <p>Use the FILTER operator to work with tuples or rows of data. Use the FOREACH operator to work with columns of data.</p>
446
-
447
- </li>
448
-
449
- <li>
450
-
451
- <p>Use the GROUP operator to group data in a single relation. Use the COGROUP and JOIN operators to group or join data in two or more relations.</p>
452
-
453
- </li>
454
-
455
- <li>
456
-
457
- <p>Use the UNION operator to merge the contents of two or more relations. Use the SPLIT operator to partition the contents of a relation into multiple relations.</p>
458
-
459
- </li>
460
-
461
- </ol>
462
- <a name="N10116"></a><a name="Using+Comments+in+Scripts"></a>
463
- <h3 class="h4">Using Comments in Scripts</h3>
464
- <p>If you place Pig Latin statements in a script, the script can include comments. </p>
465
- <ol>
466
-
467
- <li>
468
-
469
- <p>For multi-line comments use /* &hellip;. */</p>
470
-
471
- </li>
472
-
473
- <li>
474
-
475
- <p>For single line comments use --</p>
476
-
477
- </li>
478
-
479
- </ol>
480
- <pre class="code">
481
- /* myscript.pig
482
- My script includes three simple Pig Latin Statements.
483
- */
484
-
485
- A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float); -- load statement
486
- B = FOREACH A GENERATE name; -- foreach statement
487
- DUMP B; --dump statement
488
- </pre>
489
- <a name="N10133"></a><a name="Case+Sensitivity"></a>
490
- <h3 class="h4">Case Sensitivity</h3>
491
- <p>The names (aliases) of relations and fields are case sensitive. The names of Pig Latin functions are case sensitive.
492
- The names of parameters (see Parameter Substitution) and all other Pig Latin keywords are case insensitive.</p>
493
- <p>In the example below, note the following:</p>
494
- <ol>
495
-
496
- <li>
497
-
498
- <p>The names (aliases) of relations A, B, and C are case sensitive.</p>
499
-
500
- </li>
501
-
502
- <li>
503
-
504
- <p>The names (aliases) of fields f1, f2, and f3 are case sensitive.</p>
505
-
506
- </li>
507
-
508
- <li>
509
-
510
- <p>Function names PigStorage and COUNT are case sensitive.</p>
511
-
512
- </li>
513
-
514
- <li>
515
-
516
- <p>Keywords LOAD, USING, AS, GROUP, BY, FOREACH, GENERATE, and DUMP are case insensitive.
517
- They can also be written as load, using, as, group, by, etc.</p>
518
-
519
- </li>
520
-
521
- <li>
522
-
523
- <p>In the FOREACH statement, the field in relation B is referred to by positional notation ($0).</p>
524
-
525
- </li>
526
-
527
- </ol>
528
- <p></p>
529
- <pre class="code">
530
- grunt&gt; A = LOAD 'data' USING PigStorage() AS (f1:int, f2:int, f3:int);
531
- grunt&gt; B = GROUP A BY f1;
532
- grunt&gt; C = FOREACH B GENERATE COUNT ($0);
533
- grunt&gt; DUMP C;
534
- </pre>
535
- </div>
536
- <!-- END PIG LATIN STATEMENTS -->
537
-
538
-
539
-
540
- <!-- MULTI-QUERY EXECUTION-->
541
-
542
- <a name="N1016C"></a><a name="Multi-Query+Execution"></a>
543
- <h2 class="h3">Multi-Query Execution</h2>
544
- <div class="section">
545
- <p>With multi-query execution Pig processes an entire script or a batch of statements at once.</p>
546
- <a name="N10175"></a><a name="Turning+it+On+or+Off"></a>
547
- <h3 class="h4">Turning it On or Off</h3>
548
- <p>Multi-query execution is turned on by default.
549
- To turn it off and revert to Pig's "execute-on-dump/store" behavior, use the "-M" or "-no_multiquery" options. </p>
550
- <p>To run script "myscript.pig" without the optimization, execute Pig as follows: </p>
551
- <pre class="code">
552
- $ pig -M myscript.pig
553
- or
554
- $ pig -no_multiquery myscript.pig
555
- </pre>
556
- <a name="N10186"></a><a name="How+it+Works"></a>
557
- <h3 class="h4">How it Works</h3>
558
- <p>Multi-query execution introduces some changes:</p>
559
- <ol>
560
-
561
- <li>
562
-
563
- <p>For batch mode execution, the entire script is first parsed to determine if intermediate tasks
564
- can be combined to reduce the overall amount of work that needs to be done; execution starts only after the parsing is completed
565
- (see the <a href="piglatin_ref2.html#EXPLAIN">EXPLAIN</a> operator and the <a href="piglatin_ref2.html#exec">exec</a> and <a href="piglatin_ref2.html#run">run</a> commands). </p>
566
-
567
-
568
- </li>
569
-
570
- <li>
571
-
572
- <p>Two run scenarios are optimized, as explained below: explicit and implicit splits, and storing intermediate results.</p>
573
-
574
- </li>
575
-
576
- </ol>
577
- <a name="N101AA"></a><a name="Explicit+and+Implicit+Splits"></a>
578
- <h4>Explicit and Implicit Splits</h4>
579
- <p>There might be cases in which you want different processing on separate parts of the same data stream.</p>
580
- <p>Example 1:</p>
581
- <pre class="code">
582
- A = LOAD ...
583
- ...
584
- SPLIT A' INTO B IF ..., C IF ...
585
- ...
586
- STORE B' ...
587
- STORE C' ...
588
- </pre>
589
- <p>Example 2:</p>
590
- <pre class="code">
591
- A = LOAD ...
592
- ...
593
- B = FILTER A' ...
594
- C = FILTER A' ...
595
- ...
596
- STORE B' ...
597
- STORE C' ...
598
- </pre>
599
- <p>In prior Pig releases, Example 1 will dump A' to disk and then start jobs for B' and C'.
600
- Example 2 will execute all the dependencies of B' and store it and then execute all the dependencies of C' and store it.
601
- Both are equivalent, but the performance will be different. </p>
602
- <p>Here's what the multi-query execution does to increase the performance: </p>
603
- <ol>
604
-
605
- <li>
606
- <p>For Example 2, adds an implicit split to transform the query to Example 1.
607
- This eliminates the processing of A' multiple times.</p>
608
- </li>
609
-
610
- <li>
611
- <p>Makes the split non-blocking and allows processing to continue.
612
- This helps reduce the amount of data that has to be stored right at the split. </p>
613
- </li>
614
-
615
- <li>
616
- <p>Allows multiple outputs from a job. This way some results can be stored as a side-effect of the main job.
617
- This is also necessary to make the previous item work. </p>
618
- </li>
619
-
620
- <li>
621
- <p>Allows multiple split branches to be carried on to the combiner/reducer.
622
- This reduces the amount of IO again in the case where multiple branches in the split can benefit from a combiner run. </p>
623
- </li>
624
-
625
- </ol>
626
- <a name="N101DB"></a><a name="Storing+Intermediate+Results"></a>
627
- <h4>Storing Intermediate Results</h4>
628
- <p>Sometimes it is necessary to store intermediate results. </p>
629
- <pre class="code">
630
- A = LOAD ...
631
- ...
632
- STORE A'
633
- ...
634
- STORE A''
635
- </pre>
636
- <p>If the script doesn't re-load A' for the processing of A the steps above A' will be duplicated.
637
- This is a special case of Example 2 above, so the same steps are recommended.
638
- With multi-query execution, the script will process A and dump A' as a side-effect.</p>
639
- <a name="N101ED"></a><a name="Store+vs.+Dump"></a>
640
- <h3 class="h4">Store vs. Dump</h3>
641
- <p>With multi-query execution, you want to use <a href="piglatin_ref2.html#STORE">STORE</a> to save (persist) your results.
642
- You do not want to use <a href="piglatin_ref2.html#DUMP">DUMP</a> as it will disable multi-query execution and is likely to slow down execution. (If you have included DUMP statements in your scripts for debugging purposes, you should remove them.) </p>
643
- <p>DUMP Example: In this script, because the DUMP command is interactive, the multi-query execution will be disabled and two separate jobs will be created to execute this script. The first job will execute A &gt; B &gt; DUMP while the second job will execute A &gt; B &gt; C &gt; STORE.</p>
644
- <pre class="code">
645
- A = LOAD 'input' AS (x, y, z);
646
- B = FILTER A BY x &gt; 5;
647
- DUMP B;
648
- C = FOREACH B GENERATE y, z;
649
- STORE C INTO 'output';
650
- </pre>
651
- <p>STORE Example: In this script, multi-query optimization will kick in allowing the entire script to be executed as a single job. Two outputs are produced: output1 and output2.</p>
652
- <pre class="code">
653
- A = LOAD 'input' AS (x, y, z);
654
- B = FILTER A BY x &gt; 5;
655
- STORE B INTO 'output1';
656
- C = FOREACH B GENERATE y, z;
657
- STORE C INTO 'output2';
658
- </pre>
659
- <a name="N1020D"></a><a name="Error+Handling"></a>
660
- <h3 class="h4">Error Handling</h3>
661
- <p>With multi-query execution Pig processes an entire script or a batch of statements at once.
662
- By default Pig tries to run all the jobs that result from that, regardless of whether some jobs fail during execution.
663
- To check which jobs have succeeded or failed use one of these options. </p>
664
- <p>First, Pig logs all successful and failed store commands. Store commands are identified by output path.
665
- At the end of execution a summary line indicates success, partial failure or failure of all store commands. </p>
666
- <p>Second, Pig returns a different return code upon completion for each of these scenarios:</p>
667
- <ol>
668
-
669
- <li>
670
- <p>Return code 0: All jobs succeeded</p>
671
- </li>
672
-
673
- <li>
674
- <p>Return code 1: <em>Used for retrievable errors</em>
675
- </p>
676
- </li>
677
-
678
- <li>
679
- <p>Return code 2: All jobs have failed </p>
680
- </li>
681
-
682
- <li>
683
- <p>Return code 3: Some jobs have failed </p>
684
- </li>
685
-
686
- </ol>
687
- <p></p>
688
- <p>In some cases it might be desirable to fail the entire script upon detecting the first failed job.
689
- This can be achieved with the "-F" or "-stop_on_failure" command line flag.
690
- If used, Pig will stop execution when the first failed job is detected and discontinue further processing.
691
- This also means that file commands that come after a failed store in the script will not be executed (this can be used to create "done" files). </p>
692
- <p>This is how the flag is used: </p>
693
- <pre class="code">
694
- $ pig -F myscript.pig
695
- or
696
- $ pig -stop_on_failure myscript.pig
697
- </pre>
698
- <a name="N1023F"></a><a name="Backward+Compatibility"></a>
699
- <h3 class="h4">Backward Compatibility</h3>
700
- <p>Most existing Pig scripts will produce the same result with or without the multi-query execution.
701
- There are cases though where this is not true. Path names and schemes are discussed here.</p>
702
- <p>Any script is parsed in its entirety before it is sent to execution. Since the current directory can change
703
- throughout the script any path used in LOAD or STORE statement is translated to a fully qualified and absolute path.</p>
704
- <p>In map-reduce mode, the following script will load from "hdfs://&lt;host&gt;:&lt;port&gt;/data1" and store into "hdfs://&lt;host&gt;:&lt;port&gt;/tmp/out1". </p>
705
- <pre class="code">
706
- cd /;
707
- A = LOAD 'data1';
708
- cd tmp;
709
- STORE A INTO 'out1';
710
- </pre>
711
- <p>These expanded paths will be passed to any LoadFunc or Slicer implementation.
712
- In some cases this can cause problems, especially when a LoadFunc/Slicer is not used to read from a dfs file or path
713
- (for example, loading from an SQL database). </p>
714
- <p>Solutions are to either: </p>
715
- <ol>
716
-
717
- <li>
718
- <p>Specify "-M" or "-no_multiquery" to revert to the old names</p>
719
- </li>
720
-
721
- <li>
722
- <p>Specify a custom scheme for the LoadFunc/Slicer </p>
723
- </li>
724
-
725
- </ol>
726
- <p>Arguments used in a LOAD statement that have a scheme other than "hdfs" or "file" will not be expanded and passed to the LoadFunc/Slicer unchanged.</p>
727
- <p>In the SQL case, the SQLLoader function is invoked with 'sql://mytable'. </p>
728
- <pre class="code">
729
- A = LOAD 'sql://mytable' USING SQLLoader();
730
- </pre>
731
- <a name="N1026E"></a><a name="Implicit+Dependencies"></a>
732
- <h3 class="h4">Implicit Dependencies</h3>
733
- <p>If a script has dependencies on the execution order outside of what Pig knows about, execution may fail. </p>
734
- <a name="N10277"></a><a name="Example"></a>
735
- <h4>Example</h4>
736
- <p>In this script, MYUDF might try to read from out1, a file that A was just stored into.
737
- However, Pig does not know that MYUDF depends on the out1 file and might submit the jobs
738
- producing the out2 and out1 files at the same time.</p>
739
- <pre class="code">
740
- ...
741
- STORE A INTO 'out1';
742
- B = LOAD 'data2';
743
- C = FOREACH B GENERATE MYUDF($0,'out1');
744
- STORE C INTO 'out2';
745
- </pre>
746
- <p>To make the script work (to ensure that the right execution order is enforced) add the exec statement.
747
- The exec statement will trigger the execution of the statements that produce the out1 file. </p>
748
- <pre class="code">
749
- ...
750
- STORE A INTO 'out1';
751
- EXEC;
752
- B = LOAD 'data2';
753
- C = FOREACH B GENERATE MYUDF($0,'out1');
754
- STORE C INTO 'out2';
755
- </pre>
756
- <a name="N1028C"></a><a name="Example-N1028C"></a>
757
- <h4>Example</h4>
758
- <p>In this script, the STORE/LOAD operators have different file paths; however, the LOAD operator depends on the STORE operator.</p>
759
- <pre class="code">
760
- A = LOAD '/user/xxx/firstinput' USING PigStorage();
761
- B = group ....
762
- C = .... aggregation function
763
- STORE C INTO '/user/vxj/firstinputtempresult/days1';
764
- ..
765
- Atab = LOAD '/user/xxx/secondinput' USING PigStorage();
766
- Btab = group ....
767
- Ctab = .... aggregation function
768
- STORE Ctab INTO '/user/vxj/secondinputtempresult/days1';
769
- ..
770
- E = LOAD '/user/vxj/firstinputtempresult/' USING PigStorage();
771
- F = group ....
772
- G = .... aggregation function
773
- STORE G INTO '/user/vxj/finalresult1';
774
-
775
- Etab =LOAD '/user/vxj/secondinputtempresult/' USING PigStorage();
776
- Ftab = group ....
777
- Gtab = .... aggregation function
778
- STORE Gtab INTO '/user/vxj/finalresult2';
779
- </pre>
780
- <p>To make the script work, add the exec statement. </p>
781
- <pre class="code">
782
- A = LOAD '/user/xxx/firstinput' USING PigStorage();
783
- B = group ....
784
- C = .... aggregation function
785
- STORE C INTO '/user/vxj/firstinputtempresult/days1';
786
- ..
787
- Atab = LOAD '/user/xxx/secondinput' USING PigStorage();
788
- Btab = group ....
789
- Ctab = .... aggregation function
790
- STORE Ctab INTO '/user/vxj/secondinputtempresult/days1';
791
-
792
- EXEC;
793
-
794
- E = LOAD '/user/vxj/firstinputtempresult/' USING PigStorage();
795
- F = group ....
796
- G = .... aggregation function
797
- STORE G INTO '/user/vxj/finalresult1';
798
- ..
799
- Etab =LOAD '/user/vxj/secondinputtempresult/' USING PigStorage();
800
- Ftab = group ....
801
- Gtab = .... aggregation function
802
- STORE Gtab INTO '/user/vxj/finalresult2';
803
- </pre>
804
- </div>
805
- <!-- END MULTI-QUERY EXECUTION-->
806
-
807
-
808
-
809
- <!-- SPECIALIZED JOINS-->
810
-
811
- <a name="N102A7"></a><a name="Specialized+Joins"></a>
812
- <h2 class="h3">Specialized Joins</h2>
813
- <div class="section">
814
- <p>
815
- Pig Latin includes three "specialized" joins: replicated joins, skewed joins, and merge joins. </p>
816
- <ul>
817
-
818
- <li>Replicated, skewed, and merge joins can be performed using <a href="piglatin_ref2.html#JOIN+%28inner%29">inner joins</a>.</li>
819
-
820
- <li>Replicated and skewed joins can also be performed using <a href="piglatin_ref2.html#JOIN+%28outer%29">outer joins</a>.</li>
821
-
822
- </ul>
823
- <a name="N102C3"></a><a name="Replicated+Joins"></a>
824
- <h3 class="h4">Replicated Joins</h3>
825
- <p>Fragment replicate join is a special type of join that works well if one or more relations are small enough to fit into main memory.
826
- In such cases, Pig can perform a very efficient join because all of the hadoop work is done on the map side. In this type of join the
827
- large relation is followed by one or more small relations. The small relations must be small enough to fit into main memory; if they
828
- don't, the process fails and an error is generated.</p>
829
- <a name="N102CC"></a><a name="Usage"></a>
830
- <h4>Usage</h4>
831
- <p>Perform a replicated join with the USING clause (see <a href="piglatin_ref2.html#JOIN+%28inner%29">inner joins</a> and <a href="piglatin_ref2.html#JOIN+%28outer%29">outer joins</a>).
832
- In this example, a large relation is joined with two smaller relations. Note that the large relation comes first followed by the smaller relations;
833
- and, all small relations together must fit into main memory, otherwise an error is generated. </p>
834
- <pre class="code">
835
- big = LOAD 'big_data' AS (b1,b2,b3);
836
-
837
- tiny = LOAD 'tiny_data' AS (t1,t2,t3);
838
-
839
- mini = LOAD 'mini_data' AS (m1,m2,m3);
840
-
841
- C = JOIN big BY b1, tiny BY t1, mini BY m1 USING 'replicated';
842
- </pre>
843
- <a name="N102E2"></a><a name="Conditions"></a>
844
- <h4>Conditions</h4>
845
- <p>Fragment replicate joins are experimental; we don't have a strong sense of how small the small relation must be to fit
846
- into memory. In our tests with a simple query that involves just a JOIN, a relation of up to 100 M can be used if the process overall
847
- gets 1 GB of memory. Please share your observations and experience with us.</p>
848
- <a name="N102F1"></a><a name="Skewed+Joins"></a>
849
- <h3 class="h4">Skewed Joins</h3>
850
- <p>
851
- Parallel joins are vulnerable to the presence of skew in the underlying data.
852
- If the underlying data is sufficiently skewed, load imbalances will swamp any of the parallelism gains.
853
- In order to counteract this problem, skewed join computes a histogram of the key space and uses this
854
- data to allocate reducers for a given key. Skewed join does not place a restriction on the size of the input keys.
855
- It accomplishes this by splitting the left input on the join predicate and streaming the right input. The left input is
856
- sampled to create the histogram.
857
- </p>
858
- <p>
859
- Skewed join can be used when the underlying data is sufficiently skewed and you need a finer
860
- control over the allocation of reducers to counteract the skew. It should also be used when the data
861
- associated with a given key is too large to fit in memory.
862
- </p>
863
- <a name="N102FD"></a><a name="Usage-N102FD"></a>
864
- <h4>Usage</h4>
865
- <p>Perform a skewed join with the USING clause (see <a href="piglatin_ref2.html#JOIN+%28inner%29">inner joins</a> and <a href="piglatin_ref2.html#JOIN+%28outer%29">outer joins</a>). </p>
866
- <pre class="code">
867
- big = LOAD 'big_data' AS (b1,b2,b3);
868
- massive = LOAD 'massive_data' AS (m1,m2,m3);
869
- C = JOIN big BY b1, massive BY m1 USING 'skewed';
870
- </pre>
871
- <a name="N10313"></a><a name="Conditions-N10313"></a>
872
- <h4>Conditions</h4>
873
- <p>
874
- Skewed join will only work under these conditions:
875
- </p>
876
- <ul>
877
-
878
- <li>Skewed join works with two-table inner join. Currently we do not support more than two tables for skewed join.
879
- Specifying three-way (or more) joins will fail validation. For such joins, we rely on you to break them up into two-way joins.</li>
880
-
881
- <li>The pig.skewedjoin.reduce.memusage Java parameter specifies the fraction of heap available for the
882
- reducer to perform the join. A low fraction forces pig to use more reducers but increases
883
- copying cost. We have seen good performance when we set this value
884
- in the range 0.1 - 0.4. However, note that this is hardly an accurate range. Its value
885
- depends on the amount of heap available for the operation, the number of columns
886
- in the input and the skew. An appropriate value is best obtained by conducting experiments to achieve
887
- a good performance. The default value is 0.5. </li>
888
-
889
- </ul>
890
- <a name="N1032A"></a><a name="Merge+Joins"></a>
891
- <h3 class="h4">Merge Joins</h3>
892
- <p>
893
- Often user data is stored such that both inputs are already sorted on the join key.
894
- In this case, it is possible to join the data in the map phase of a MapReduce job.
895
- This provides a significant performance improvement compared to passing all of the data through
896
- unneeded sort and shuffle phases.
897
- </p>
898
- <p>
899
- Pig has implemented a merge join algorithm, or sort-merge join, although in this case the sort is already
900
- assumed to have been done (see the Conditions, below).
901
-
902
- Pig implements the merge join algorithm by selecting the left input of the join to be the input file for the map phase,
903
- and the right input of the join to be the side file. It then samples records from the right input to build an
904
- index that contains, for each sampled record, the key(s), the filename, and the offset into the file the record
905
- begins at. This sampling is done in an initial map only job. A second MapReduce job is then initiated,
906
- with the left input as its input. Each map uses the index to seek to the appropriate record in the right
907
- input and begin doing the join.
908
- </p>
909
- <a name="N10336"></a><a name="Usage-N10336"></a>
910
- <h4>Usage</h4>
911
- <p>Perform a merge join with the USING clause (see <a href="piglatin_ref2.html#JOIN+%28inner%29">inner joins</a>).</p>
912
- <pre class="code">
913
- C = JOIN A BY a1, B BY b1 USING 'merge';
914
- </pre>
915
- <a name="N10348"></a><a name="Conditions-N10348"></a>
916
- <h4>Conditions</h4>
917
- <p>
918
- Merge join will only work under these conditions:
919
- </p>
920
- <ul>
921
-
922
- <li>Both inputs are sorted in <em>ascending</em> order of join keys. If an input consists of many files, there should be
923
- a total ordering across the files in the <em>ascending order of file name</em>. So for example if one of the inputs to the
924
- join is a directory called input1 with files a and b under it, the data should be sorted in ascending order of join
925
- key when read starting at a and ending in b. Likewise if an input directory has part files part-00000, part-00001,
926
- part-00002 and part-00003, the data should be sorted if the files are read in the sequence part-00000, part-00001,
927
- part-00002 and part-00003. </li>
928
-
929
- <li>The merge join only has two inputs </li>
930
-
931
- <li>The loadfunc for the right input of the join should implement the OrderedLoadFunc interface (PigStorage does
932
- implement the OrderedLoadFunc interface). </li>
933
-
934
- <li>Only inner join will be supported </li>
935
-
936
-
937
- <li>Between the load of the sorted input and the merge join statement there can only be filter statements and
938
- foreach statement where the foreach statement should meet the following conditions:
939
- <ul>
940
-
941
- <li>There should be no UDFs in the foreach statement </li>
942
-
943
- <li>The foreach statement should not change the position of the join keys </li>
944
-
945
- <li>There should be no transformation on the join keys that changes the sort order </li>
946
-
947
- </ul>
948
-
949
- </li>
950
-
951
-
952
- </ul>
953
- <p></p>
954
- <p>
955
- For optimal performance, each part file of the left (sorted) input of the join should have a size of at least
956
- 1 hdfs block size (for example if the hdfs block size is 128 MB, each part file should be at least 128 MB).
957
- If the total input size (including all part files) is greater than blocksize, then the part files should be uniform in size
958
- (without large skews in sizes). The main idea is to eliminate skew in the amount of input the final map
959
- job performing the merge-join will process.
960
- </p>
961
- <p>
962
- In local mode, merge join will revert to regular join.
963
- </p>
964
- </div>
965
- <!-- END SPECIALIZED JOINS-->
966
-
967
- <!-- OPTIMIZATION RULES -->
968
-
969
- <a name="N1037F"></a><a name="Optimization+Rules"></a>
970
- <h2 class="h3">Optimization Rules</h2>
971
- <div class="section">
972
- <p>Pig supports various optimization rules. By default optimization, and all optimization rules, are turned on.
973
- To turn off optimization, use:</p>
974
- <pre class="code">
975
- pig -optimizer_off [opt_rule | all ]
976
- </pre>
977
- <p>Note that some rules are mandatory and cannot be turned off.</p>
978
- <a name="N1038F"></a><a name="ImplicitSplitInserter"></a>
979
- <h3 class="h4">ImplicitSplitInserter</h3>
980
- <p>Status: Mandatory</p>
981
- <p>
982
-
983
- <a href="piglatin_ref2.html#SPLIT">SPLIT</a> is the only operator that models multiple outputs in Pig.
984
- To ease the process of building logical plans, all operators are allowed to have multiple outputs. As part of the
985
- optimization, all non-split operators that have multiple outputs are altered to have a SPLIT operator as the output
986
- and the outputs of the operator are then made outputs of the SPLIT operator. An example will illustrate the point.
987
- Here, a split will be inserted after the LOAD and the split outputs will be connected to the FILTER (b) and the COGROUP (c).
988
- </p>
989
- <pre class="code">
990
- A = LOAD 'input';
991
- B = FILTER A BY $1 == 1;
992
- C = COGROUP A BY $0, B BY $0;
993
- </pre>
994
- <a name="N103A4"></a><a name="TypeCastInserter"></a>
995
- <h3 class="h4">TypeCastInserter</h3>
996
- <p>Status: Mandatory</p>
997
- <p>
998
- If you specify a <a href="piglatin_ref2.html#Schemas">schema</a> with the
999
- <a href="piglatin_ref2.html#LOAD">LOAD</a> statement, the optimizer will perform a pre-fix projection of the columns
1000
- and <a href="piglatin_ref2.html#Cast+Operators">cast</a> the columns to the appropriate types. An example will illustrate the point.
1001
- The LOAD statement (a) has a schema associated with it. The optimizer will insert a FOREACH operator that will project columns 0, 1 and 2
1002
- and also cast them to chararray, int and float respectively.
1003
- </p>
1004
- <pre class="code">
1005
- A = LOAD 'input' AS (name: chararray, age: int, gpa: float);
1006
- B = FILTER A BY $1 == 1;
1007
- C = GROUP A By $0;
1008
- </pre>
1009
- <a name="N103C1"></a><a name="StreamOptimizer"></a>
1010
- <h3 class="h4">StreamOptimizer</h3>
1011
- <p>
1012
- Optimize when <a href="piglatin_ref2.html#LOAD">LOAD</a> precedes <a href="piglatin_ref2.html#STREAM">STREAM</a>
1013
- and the loader class is the same as the serializer for the stream. Similarly, optimize when STREAM is followed by
1014
- <a href="piglatin_ref2.html#STORE">STORE</a> and the deserializer class is the same as the storage class.
1015
- For both of these cases the optimization is to replace the loader/serializer with BinaryStorage which just moves bytes
1016
- around and to replace the storer/deserializer with BinaryStorage.
1017
- </p>
1018
- <a name="N103D7"></a><a name="OpLimitOptimizer"></a>
1019
- <h3 class="h4">OpLimitOptimizer</h3>
1020
- <p>
1021
- The objective of this rule is to push the <a href="piglatin_ref2.html#LIMIT">LIMIT</a> operator up the data flow graph
1022
- (or down the tree for database folks). In addition, for top-k (ORDER BY followed by a LIMIT) the LIMIT is pushed into the ORDER BY.
1023
- </p>
1024
- <pre class="code">
1025
- A = LOAD 'input';
1026
- B = ORDER A BY $0;
1027
- C = LIMIT B 10;
1028
- </pre>
1029
- <a name="N103E9"></a><a name="PushUpFilters"></a>
1030
- <h3 class="h4">PushUpFilters</h3>
1031
- <p>
1032
- The objective of this rule is to push the <a href="piglatin_ref2.html#FILTER">FILTER</a> operators up the data flow graph.
1033
- As a result, the number of records that flow through the pipeline is reduced.
1034
- </p>
1035
- <pre class="code">
1036
- A = LOAD 'input';
1037
- B = GROUP A BY $0;
1038
- C = FILTER B BY $0 &lt; 10;
1039
- </pre>
1040
- <a name="N103FB"></a><a name="PushDownExplodes"></a>
1041
- <h3 class="h4">PushDownExplodes</h3>
1042
- <p>
1043
- The objective of this rule is to reduce the number of records that flow through the pipeline by moving
1044
- <a href="piglatin_ref2.html#FOREACH">FOREACH</a> operators with a
1045
- <a href="piglatin_ref2.html#Flatten+Operator">FLATTEN</a> down the data flow graph.
1046
- In the example shown below, it would be more efficient to move the foreach after the join to reduce the cost of the join operation.
1047
- </p>
1048
- <pre class="code">
1049
- A = LOAD 'input' AS (a, b, c);
1050
- B = LOAD 'input2' AS (x, y, z);
1051
- C = FOREACH A GENERATE FLATTEN($0), B, C;
1052
- D = JOIN C BY $1, B BY $1;
1053
- </pre>
1054
- </div> <!-- END OPTIMIZATION RULES -->
1055
-
1056
- <!-- MEMORY MANAGEMENT -->
1057
-
1058
- <a name="N10416"></a><a name="Memory+Management"></a>
1059
- <h2 class="h3">Memory Management</h2>
1060
- <div class="section">
1061
- <p>Pig allocates a fixed amount of memory to store bags and spills to disk as soon as the memory limit is reached. This is very similar to how Hadoop decides when to spill data accumulated by the combiner. </p>
1062
- <p>The amount of memory allocated to bags is determined by pig.cachedbag.memusage; the default is set to 10% of available memory. Note that this memory is shared across all large bags used by the application.</p>
1063
- </div>
1064
- <!-- END MEMORY MANAGEMENT -->
1065
-
1066
-
1067
- <!-- ZEBRA INTEGRATION -->
1068
-
1069
- <a name="N10427"></a><a name="Zebra+Integration"></a>
1070
- <h2 class="h3">Zebra Integration</h2>
1071
- <div class="section">
1072
- <p>For information about how to integrate Zebra with your Pig scripts, see <a href="zebra_pig.html">Zebra and Pig</a>.</p>
1073
- </div>
1074
- <!-- END ZEBRA INTEGRATION -->
1075
-
1076
-
1077
-
1078
-
1079
- </div>
1080
- <!--+
1081
- |end content
1082
- +-->
1083
- <div class="clearboth">&nbsp;</div>
1084
- </div>
1085
- <div id="footer">
1086
- <!--+
1087
- |start bottomstrip
1088
- +-->
1089
- <div class="lastmodified">
1090
- <script type="text/javascript"><!--
1091
- document.write("Last Published: " + document.lastModified);
1092
- // --></script>
1093
- </div>
1094
- <div class="copyright">
1095
- Copyright &copy;
1096
- 2007-2010 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
1097
- </div>
1098
- <!--+
1099
- |end bottomstrip
1100
- +-->
1101
- </div>
1102
- </body>
1103
- </html>