wukong 3.0.0.pre → 3.0.0.pre2

Files changed (476)
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,8 +0,0 @@
- ---
- layout: default
- title: mrflip.github.com/wukong
- collapse: false
- ---
- h1(gemheader). Intro %(small):: 3 simple examples%
-
- {% include intro.textile %}
@@ -1,174 +0,0 @@
- ---
- layout: default
- title: mrflip.github.com/wukong - TODO
- collapse: false
- ---
-
-
- h1(gemheader). Wukong More Info
-
- ** "Why is it called Wukong?":#name
- ** "Don't Use Wukong, use this instead":#whateverdude
- ** "Further Reading and useful links":#links
- ** "Note on Patches/Pull Requests":#patches
- ** "What's up with Wukong::AndPig?":#andpig
- ** "Map/Reduce Algorithms":#algorithms
- ** "TODOs":#TODO
-
-
- <notextile><div class="toggle"></notextile>
-
- h2(#name). Why is it called Wukong?
-
- Hadoop, as you may know, is "named after a stuffed elephant.":http://en.wikipedia.org/wiki/Hadoop Since Wukong was started by the "infochimps":http://infochimps.org team, we needed a simian analog. A Monkey King who journeyed to the land of the Elephant seems to fit the bill:
-
- bq. Sun Wukong (孙悟空), known in the West as the Monkey King, is the main character in the classical Chinese epic novel Journey to the West. In the novel, he accompanies the monk Xuanzang on the journey to retrieve Buddhist sutras from India.
-
- bq. Sun Wukong possesses incredible strength, being able to lift his 13,500 jīn (8,100 kg) Ruyi Jingu Bang with ease. He also has superb speed, traveling 108,000 li (54,000 kilometers) in one somersault. Sun knows 72 transformations, which allows him to transform into various animals and objects; he is, however, shown with slight problems transforming into other people, since he is unable to complete the transformation of his tail. He is a skilled fighter, capable of holding his own against the best generals of heaven. Each of his hairs possesses magical properties, and is capable of transforming into a clone of the Monkey King himself, or various weapons, animals, and other objects. He also knows various spells in order to command wind, part water, conjure protective circles against demons, freeze humans, demons, and gods alike. -- ["Sun Wukong's Wikipedia entry":http://en.wikipedia.org/wiki/Wukong]
-
- The "Jaime Hewlett / Damon Albarn short":http://news.bbc.co.uk/sport1/hi/olympics/monkey that the BBC made for their 2008 Olympics coverage gives the general idea.
-
- <notextile></div><div class="toggle"></notextile>
-
- h2(#algorithms). Map/Reduce Algorithms
-
- Example graph scripts:
-
- * Multigraph
- * Pagerank (done)
- * Breadth-first search
- * Triangle enumeration
- * Clustering
-
- h3. K-Nearest Neighbors
-
- More example hadoop algorithms:
- * Bigram counts: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/bigrams.html
- * Inverted index construction: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/indexer.html
- * Pagerank : http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/pagerank.html
- * SIPs, Median, classifiers and more : http://matpalm.com/
- * Brad Heintz's "Distributed Computing with Ruby":http://www.bradheintz.com/no1thing/talks/ demonstrates Travelling Salesman in map/reduce.
-
- * "Clustering billions of images with large scale nearest neighbor search":http://scholar.google.com/scholar?cluster=2473742255769621469&hl=en uses three map/reduce passes:
- ** Subsample to build a "spill tree" that roughly localizes each object
- ** Use the spill tree on the full dataset to group each object with its potential neighbors
- ** Calculate the metrics and emit only the k-nearest neighbors
-
- Example example scripts (from http://www.cloudera.com/resources/learning-mapreduce):
-
- 1. Find the [number of] hits by 5 minute timeslot for a website given its access logs.
- 2. Find the pages with over 1 million hits in day for a website given its access logs.
- 3. Find the pages that link to each page in a collection of webpages.
- 4. Calculate the proportion of lines that match a given regular expression for a collection of documents.
- 5. Sort tabular data by a primary and secondary column.
- 6. Find the most popular pages for a website given its access logs.
-
- <notextile></div><div class="toggle"></notextile>
-
- h2(#whateverdude). Don't Use Wukong, use this instead
-
- There are several worthy Hadoop|Streaming Frameworks:
-
- * infochimps.org's "Wukong":http://github.com/mrflip/wukong -- ruby; object-oriented *and* record-oriented
- * NYTimes' "MRToolkit":http://code.google.com/p/mrtoolkit/ -- ruby; much more log-oriented
- * Freebase's "Happy":http://code.google.com/p/happy/ -- python; the most performant, as it can use Jython to make direct API calls.
- * Last.fm's "Dumbo":http://wiki.github.com/klbostee/dumbo -- python
-
- Most people use Wukong / one of the above (or straight Java Hadoop, poor souls) for heavy lifting, and several of the following hadoop tools for efficiency:
-
- * Pig OR
- * Hive -- hive is more SQL-ish, Pig is more elegant (in a brushed-metal kind of way). I greatly prefer Pig, because I hate SQL; you may feel differently.
- * Sqoop
- * Mahout
-
- <notextile></div><div class="toggle"></notextile>
-
- h2(#links). Further Reading and useful links:
-
- * "Ruby Hadoop Quickstart":http://blog.pdatasolutions.com/post/191978092/ruby-on-hadoop-quickstart - dive right in with Wukong, Hadoop and the Amazon Elastic MapReduce cloud. Once you get bored with the command line, this is the fastest path to Wukong power.
- * "Distributed Computing with Ruby":http://www.bradheintz.com/no1thing/talks/ has some raw ruby, some Wukong and some JRuby/Hadoop integration -- it demonstrates a Travelling Salesman in map/reduce. Cool!
-
- * "Hadoop, The Definitive Guide":http://www.amazon.com/Hadoop-Definitive-Guide-Tom-White/dp/0596521979
-
- * "Running Hadoop On Ubuntu Linux (Single-Node Cluster)":http://www.michael-noll.com/wiki/Running_Hadoop_On_Ubuntu_Linux_(Single-Node_Cluster) and "unning Hadoop On Ubuntu Linux (Multi-Node Cluster).":http://www.michael-noll.com/wiki/Running_Hadoop_On_Ubuntu_Linux_(Multi-Node_Cluster)
- * "Running Hadoop MapReduce on Amazon EC2 and S3":http://developer.amazonwebservices.com/connect/entry.jspa?externalID=873
-
- * "Hadoop Overview by Doug Cutting":http://video.google.com/videoplay?docid=-4912926263813234341 - the founder of the Hadoop project. (49m video)
-
- * "Cluster Computing and Map|Reduce":http://www.youtube.com/results?search_query=cluster+computing+and+mapreduce
- ** "Lecture 1: Overview":http://www.youtube.com/watch?v=yjPBkvYh-ss
- ** "Lecture 2 (technical): Map|Reduce":http://www.youtube.com/watch?v=-vD6PUdf3Js
- ** "Lecture 3 (technical): GFS (Google File System)":http://www.youtube.com/watch?v=5Eib_H_zCEY
- ** "Lecture 4 (theoretical): Canopy Clustering":http://www.youtube.com/watch?v=1ZDybXl212Q
- ** "Lecture 5 (theoretical): Breadth-First Search":http://www.youtube.com/watch?v=BT-piFBP4fE
-
- * "Cloudera Hadoop Training:":http://www.cloudera.com/hadoop-training
- ** "Thinking at Scale":http://www.cloudera.com/hadoop-training-thinking-at-scale
- ** "Mapreduce and HDFS":http://www.cloudera.com/hadoop-training-mapreduce-hdfs
- ** "A Tour of the Hadoop Ecosystem":http://www.cloudera.com/hadoop-training-ecosystem-tour
- ** "Programming with Hadoop":http://www.cloudera.com/hadoop-training-programming-with-hadoop
- ** "Hadoop and Hive: introduction":http://www.cloudera.com/hadoop-training-hive-introduction
- ** "Hadoop and Hive: tutorial":http://www.cloudera.com/hadoop-training-hive-tutorial
- ** "Hadoop and Pig: Introduction":http://www.cloudera.com/hadoop-training-pig-introduction
- ** "Hadoop and Pig: Tutorial":http://www.cloudera.com/hadoop-training-pig-tutorial
- ** "Mapreduce Algorithms":http://www.cloudera.com/hadoop-training-mapreduce-algorithms
- ** "Exercise: Getting started with Hadoop":http://www.cloudera.com/hadoop-training-exercise-getting-started-with-hadoop
- ** "Exercise: Writing mapreduce programs":http://www.cloudera.com/hadoop-training-exercise-writing-mapreduce-programs
- ** "Cloudera Blog":http://www.cloudera.com/blog/
-
- * "Hadoop Wiki: Hadoop Streaming":http://wiki.apache.org/hadoop/HadoopStreaming
- * "Hadoop Docs: Hadoop Streaming":http://hadoop.apache.org/common/docs/current/streaming.html
-
- * A "dimwitted screed on Ruby, Hadoop and Starling":http://www.theregister.co.uk/2008/08/11/hadoop_dziuba/ seemingly written with jockstrap on head.
-
- <notextile></div><div class="toggle"></notextile>
-
- h2(#patches). Note on Patches/Pull Requests
-
- * Fork the project.
- * Make your feature addition or bug fix.
- * Add tests for it. This is important so I don't break it in a future version unintentionally.
- * Commit, do not mess with rakefile, version, or history. (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
- * Send me a pull request. Bonus points for topic branches.
-
- <notextile></div><div class="toggle"></notextile>
-
- h2(#andpig). What's up with Wukong::AndPig?
-
- @Wukong::AndPig@ is a small library to more easily generate code for the "Pig":http://hadoop.apache.org/pig data analysis language. See its "README":http://github.com/mrflip/wukong/tree/master/lib/wukong/and_pig/README.textile for more.
-
- It's **not really being worked on**, and you should probably **ignore it**.
-
- <notextile></div><div class="toggle"></notextile>
-
- h2(#todo). TODOs
-
- Utility
-
- * columnizing / reconstituting
-
- * Set up with JRuby
- * Allow for direct HDFS operations
- * Make the dfs commands slightly less stupid
- * add more standard options
- * Allow for combiners
- * JobStarter / JobSteps
- * might as well take dumbo's command line args
-
- BUGS:
-
- * Can't do multiple input files in local mode
-
- Patterns to implement:
-
- * Stats reducer
- ** basic sum, avg, max, min, std.dev of a numeric field
- ** the "running standard deviation":http://www.johndcook.com/standard_deviation.html
-
- * Efficient median (and other order statistics)
-
- * Make StructRecordizer work generically with other reducers (spec. AccumulatingReducer)
-
- Make wutils: tsv-oriented implementations of the coreutils (eg uniq, sort, cut, nl, wc, split, ls, df and du) to instrinsically accept and emit tab-separated records.
-
- <notextile></div></notextile>
@@ -1,24 +0,0 @@
- ---
- layout: default
- title: edamame news
- collapse: true
- ---
- <h1 class="gemheader">{% if site.gemname %}{{ site.gemname }}{% else %}mrflip{% endif %}<span class="small">:: news</span></h1>
-
- <div id="news">
- {% for t in site.posts %} {% assign has_posts = true %}{% endfor %}{% if has_posts %}
- {% for post in site.posts %}
- <div class="toggle" id="news-{{ post.id }}">
-
- <h2><a href="{{ post.url }}">{{ post.title }}</a><span class="postdate"> &raquo; {{ post.date | date_to_string }}</span></h2>
-
- {{ post.content }}
-
- </div>
- {% endfor %}
- {% else %}
- <p class="heavy">
- <em>(no news. good news?)</em>
- </p>
- {% endif %}
- </div>
@@ -1,122 +0,0 @@
-
- h2. RelationalOperators
-
- * foreach
-
- * cogroup
- * group
- * join
-
- * cross
-
- * distinct
- * filter
- * limit
- * order
- * split
- * union
-
- * load
- * store
-
- h2. Streaming Operator
- * stream
-
- h2. UDF Statements
- * define
- * register
-
- h2. Diagnostic Statements
- * describe
- * dump
- * explain
- * illustrate
-
- h2. Built-in Functions
- * EvalFunctions
- * AVG
- * CONCAT
- * COUNT
- * DIFF
- * MIN
- * MAX
- * SIZE
- * SUM
- * TOKENIZE
- * Load/StoreFunctions
- * BinaryDeserializer
- * BinarySerializer
- * BinStorage
- * PigStorage
- * PigDump
- * TextLoader
-
- h1. Operators
- * ArithmeticOperators
- - addition+
- - subtraction-
- - multiplication*
- - division/
- - modulo%
- - bincond?
- * ComparisonOperators
- - Equal==
- - notequal!=
- - lessthan<
- - greaterthan>
- - lessthanorequalto<=
- - greaterthanorequalto>=
- - patternmatchingmatches
- * NullOperators
- - isnull
- - isnotnull
- * BooleanOperators
- - and
- - or
- - not
- * DereferenceOperators
- - tupledereference.
- - mapdereference#
- * SignOperators
- - positive+
- - negative-
- * CastOperators
- - (type)$0
- - (type)alias
- - Nulls
- - Constants
- - Expressions
- - Schemas
- - Keywords
-
- h1. DataTypes
- h2. SimpleDataTypes
- - int
- - long
- - double
- - arrays
- - chararray
- - bytearray
- h2. ComplexDataTypes
- - tuple
- - bag
- - map
-
- h1. FileCommands
- * cat
- * cd
- * copyFromLocal
- * copyToLocal
- * cp
- * ls
- * mkdir
- * mv
- * pwd
- * rm
- * rmf
-
- h1. UtilityCommands
- * help
- * kill
- * quit
- * set
@@ -1,1640 +0,0 @@
1
- # ---------------------------------------------------------------------------
2
- #
3
- # = CROSS
4
- #
5
- # Computes the cross product of two or more relations.
6
- #
7
- # == Syntax
8
- #
9
- # alias = CROSS alias, alias [, alias …] [PARALLEL n];
10
- #
11
- # == Terms
12
- #
13
- # alias::
14
- # The name of a relation.
15
- #
16
- # PARALLEL n::
17
- # Increase the parallelism of a job by specifying the number of reduce tasks,
18
- # n. The optimal number of parallel tasks depends on the amount of memory on
19
- # each node and the memory required by each of the tasks. To determine n, use
20
- # the following as a general guideline:
21
- # n = (nr_nodes - 1) * 0.45 * nr_GB
22
- # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
23
- # memory on each node.
24
- #
25
- # Note the following:
26
- # * Parallel only affects the number of reduce tasks. Map parallelism is
27
- # determined by the input file, one map for each HDFS block.
28
- # * If you don’t specify parallel, you still get the same map parallelism but
29
- # only one reduce task.
30
- #
31
- # == Usage
32
- #
33
- # Use the CROSS operator to compute the cross product (Cartesian product) of two
34
- # or more relations.
35
- #
36
- # CROSS is an expensive operation and should be used sparingly.
37
- #
38
- # == Example
39
- #
40
- # Suppose we have relations A and B.
41
- #
42
- # (A) (B)
43
- # ----------- --------
44
- # (1, 2, 3) (2, 4)
45
- # (4, 2, 1) (8, 9)
46
- # (1, 3)
47
- #
48
- # In this example the cross product of relation A and B is computed.
49
- #
50
- # X = CROSS A, B;
51
- #
52
- # Relation X looks like this.
53
- #
54
- # (1, 2, 3, 2, 4)
55
- # (1, 2, 3, 8, 9)
56
- # (1, 2, 3, 1, 3)
57
- # (4, 2, 1, 2, 4)
58
- # (4, 2, 1, 8, 9)
59
- # (4, 2, 1, 1, 3)
60
- #
61
-
62
-
63
- # ---------------------------------------------------------------------------
64
- #
65
- # DISTINCT
66
- #
67
- # Removes duplicate tuples in a relation.
68
- #
69
- # == Syntax
70
- #
71
- # alias = DISTINCT alias [PARALLEL n];
72
- #
73
- # == Terms
74
- #
75
- # alias::
76
- # The name of a relation.
77
- #
78
- # PARALLEL n::
79
- # Increase the parallelism of a job by specifying the number of reduce tasks,
80
- # n. The optimal number of parallel tasks depends on the amount of memory on
81
- # each node and the memory required by each of the tasks. To determine n, use
82
- # the following as a general guideline:
83
- # n = (nr_nodes - 1) * 0.45 * nr_GB
84
- # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
85
- # memory on each node.
86
- #
87
- # Note the following:
88
- # * Parallel only affects the number of reduce tasks. Map parallelism is
89
- # determined by the input file, one map for each HDFS block.
90
- # * If you don’t specify parallel, you still get the same map parallelism but
91
- # only one reduce task.
92
- #
93
- # == Usage
94
- #
95
- # Use the DISTINCT operator to remove duplicate tuples in a relation. DISTINCT
96
- # does not preserve the original order of the contents (to eliminate duplicates,
97
- # Pig must first sort the data). You cannot use DISTINCT on a subset of fields. To
98
- # do this, use FOREACH … GENERATE to select the fields, and then use DISTINCT.
99
- #
100
- # == Example
101
- #
102
- # Suppose we have relation A.
103
- #
104
- # (A)
105
- # ---------
106
- # (8, 3, 4)
107
- # (1, 2, 3)
108
- # (4, 3, 3)
109
- # (4, 3, 3)
110
- # (1, 2, 3)
111
- #
112
- # In this example all duplicate tuples are removed.
113
- #
114
- # X = DISTINCT A;
115
- #
116
- # Relation X looks like this.
117
- #
118
- # (1, 2, 3)
119
- # (4, 3, 3)
120
- # (8, 3, 4)
121
- #
122
-
123
- # ---------------------------------------------------------------------------
124
- #
125
- # FILTER
126
- #
127
- # Selects tuples (rows) from a relation based on some condition.
128
- #
129
- # == Syntax
130
- #
131
- # alias = FILTER alias BY expression;
132
- #
133
- # == Terms
134
- #
135
- # alias::
136
- # The name of a relation.
137
- #
138
- # BY::
139
- # Required keyword.
140
- #
141
- # expression::
142
- # An expression.
143
- #
144
- # == Usage
145
- #
146
- # Use the FILTER operator to work with tuples (rows) of data. FILTER is commonly
147
- # used to select the data that you want; or, conversely, to filter out (remove)
148
- # the data you don’t want.
149
- #
150
- # Note: If you want to work with specific fields (columns) of data, use the
151
- # FOREACH …GENERATE operation.
152
- #
153
- # == Examples
154
- #
155
- # Suppose we have relation A.
156
- #
157
- # (A: f1:int, f2:int, f3:int)
158
- # ----------------
159
- # (1, 2, 3)
160
- # (4, 2, 1)
161
- # (8, 3, 4)
162
- # (4, 3, 3)
163
- # (7, 2, 5)
164
- # (8, 4, 3)
165
- #
166
- # In this example the condition states that if the third field equals 3, then add the tuple to relation X.
167
- #
168
- # X = FILTER A BY f3 == 3;
169
- #
170
- # Relation X looks like this.
171
- #
172
- # (1, 2, 3)
173
- # (4, 3, 3)
174
- # (8, 4, 3)
175
- #
176
- # In this example the condition states that if the first field equals 8 or if the sum of fields f2 and f3 is not greater than first field, then add the tuple to relation X.
177
- #
178
- # X = FILTER A BY (f1 == 8) OR (NOT (f2+f3 > f1));
179
- #
180
- # Relation X looks like this.
181
- #
182
- # (4, 2, 1)
183
- # (8, 3, 4)
184
- # (7, 2, 5)
185
- # (8, 4, 3)
186
- #
187
-
188
- # ---------------------------------------------------------------------------
189
- #
190
- # FOREACH … GENERATE
191
- #
192
- # Generates data transformations based on fields (columns) of data.
193
- #
194
- # == Syntax
195
- #
196
- # alias = FOREACH { gen_blk | nested_gen_blk } [AS schema];
197
- #
198
- # == Terms
199
- #
200
- # alias::
201
- # The name of a relation.
202
- #
203
- # gen_blk::
204
- # FOREACH … GENERATE used with a non-nested relation. Use this syntax:
205
- #
206
- # alias = FOREACH alias GENERATE expression [expression ….]
207
- #
208
- # nested_gen_blk::
209
- # FOREACH … GENERATE used with a nested relation. Use this syntax:
210
- #
211
- # alias = FOREACH nested_alias {
212
- # alias = nested_op; [alias = nested_op; …]
213
- # GENERATE expression [expression ….]
214
- # };
215
- #
216
- # where:
217
- # * The nested block is enclosed in opening and closing brackets { … }.
218
- # * The GENERATE keyword must be the last statement within the nested block.
219
- #
220
- # expression::
221
- # An expression.
222
- #
223
- # nested_alias::
224
- # If one of the fields (columns) in a relation is a bag, the bag can be treated
225
- # as an inner or a nested relation.
226
- #
227
- # nested_op::
228
- # Allowable operations include FILTER, ORDER, and DISTINCT.
229
- #
230
- # The FOREACH … GENERATE operation itself is not allowed since this could lead
231
- # to an arbitrary number of nesting levels.
232
- #
233
- # AS::
234
- # Keyword.
235
- #
236
- # schema::
237
- # A schema using the AS keyword (see Schemas).
238
- #
239
- # * If the FLATTEN keyword is used, enclose the schema in parentheses.
240
- #
241
- # * If the FLATTEN keyword is not used, don't enclose the schema in parentheses.
242
- #
243
- # == Usage
244
- #
245
- # Use the FOREACH …GENERATE operation to work with individual fields (columns) of data. The FOREACH …GENERATE operation works with non-nested and nested relations.
246
- #
247
- # A statement with a non-nested relation A could look like this.
248
- #
249
- # X = FOREACH A GENERATE f1;
250
- #
251
- # A statement with a nested relation A could look like this.
252
- #
253
- # X = FOREACH B {
254
- #
255
- # S = FILTER A by 'xyz';
256
- #
257
- # GENERATE COUNT (S.$0);
258
- #
259
- # }
260
- #
261
- # Note: FOREACH … GENERATE works with fields (columns) of data. If you want to work with entire tuples (rows) of data, use the FILTER operation.
262
- #
263
- # == Examples
264
- #
265
- # Suppose we have relations A and B, and derived relation C (where C = COGROUP A BY a1 INNER, B BY b1 INNER;).
266
- #
267
- # (A: a1:int, a2:int, a3:int)
268
- # -----------------
269
- # (1, 2, 3)
270
- # (4, 2, 1)
271
- # (8, 3, 4)
272
- # (4, 3, 3)
273
- # (7, 2, 5)
274
- # (8, 4, 3)
275
- #
276
- #
277
- # (B: b1:int, b2:int)
278
- # ---------------
279
- # (2, 4)
280
- # (8, 9)
281
- # (1, 3)
282
- # (2, 7)
283
- # (2, 9)
284
- # (4, 6)
285
- # (4, 9)
286
- #
287
- # (C: c1, c2, c3)
288
- # ---------------------
289
- # (1, {(1, 2, 3)}, {(1, 3)})
290
- # (4, {(4, 2, 1), (4, 3, 3)}, {(4, 6), (4, 9)})
291
- # (8, {(8, 3, 4), (8, 4, 3)}, {(8, 9)})
292
- #
293
- #
294
- # == Example: Projection
295
- #
296
- # In this example the asterisk (*) is used to project all fields from relation A to relation X (this is similar to SQL Select *). Relation A and X are identical.
297
- #
298
- # X = FOREACH A GENERATE *;
299
- #
300
- # In this example two fields from relation A are projected to form relation X.
301
- #
302
- # X = FOREACH A GENERATE a1, a2;
303
- #
304
- # Relation X looks this.
305
- #
306
- # (1, 2)
307
- # (4, 2)
308
- # (8, 3)
309
- # (4, 3)
310
- # (7, 2)
311
- # (8, 4)
312
- # == Example: Nested Projection
313
- #
314
- # Note: See GROUP for information about the "group" field in relation C.
315
- #
316
- # In this example if one of the fields in the input relation is a tuple, bag or map, we can perform projection on that field.
317
- #
318
- # X = FOREACH C GENERATE group, B.b2;
319
- #
320
- # Relation X looks like this.
321
- #
322
- # (1, {(3)})
323
- # (4, {(6), (9)})
324
- # (8, {(9)})
325
- #
326
- # In this example multiple nested columns are retained.
327
- #
328
- # X = FOREACH C GENERATE group, A.(a1, a2);
329
- #
330
- # Relation X looks like this.
331
- #
332
- # (1, {(1, 2)})
333
- # (4, {(4, 2), (4, 3)})
334
- # (8, {(8, 3), (8, 4)})
335
- # == Example: Schema
336
- #
337
- # In this example two fields in relation A are summed to form relation X. A schema is defined for the projected field.
338
- #
339
- # X = FOREACH A GENERATE a1+a2 AS f1:int;
340
- #
341
- # Y = FILTER X by f1 > 10;
342
- #
343
- # Relations X and Y look this.
344
- #
345
- # (X) (Y)
346
- # ----- ------
347
- # (3) (11)
348
- # (6) (12)
349
- # (11)
350
- # (7)
351
- # (9)
352
- # (12)
353
- #
354
- # == Example: Applying Functions
355
- #
356
- # Note: See GROUP for information about the "group" field in relation C.
357
- #
358
- # In this example the built-in function SUM() is used to sum a set of numbers in a bag.
359
- #
360
- # X = FOREACH C GENERATE group, SUM (A.a1);
361
- #
362
- # Relation X looks like this.
363
- #
364
- # (1, 1)
365
- # (4, 8)
366
- # (8, 16)
367
- # == Example: Flattening
368
- #
369
- # Note: See GROUP for information about the "group" field in relation C.
370
- #
371
- # In this example the FLATTEN keyword is used to eliminate nesting.
372
- #
373
- # X = FOREACH C GENERATE group, FLATTEN(A);
374
- #
375
- # Relation X looks like this.
376
- #
377
- # (1, 1, 2, 3)
378
- # (4, 4, 2, 1)
379
- # (4, 4, 3, 3)
380
- # (8, 8, 3, 4)
381
- # (8, 8, 4, 3)
382
- #
383
- # Another FLATTEN example.
384
- #
385
- # X = FOREACH C GENERATE GROUP, FLATTEN(A.a3);
386
- #
387
- # Relation X looks like this.
388
- #
389
- # (1, 3)
390
- # (4, 1)
391
- # (4, 3)
392
- # (8, 4)
393
- # (8, 3)
394
- #
395
- # Another FLATTEN example.
396
- #
397
- # X = FOREACH C GENERATE FLATTEN(A.(f1, f2)), FLATTEN(B.$1);
398
- #
399
- # Relation X looks like this. Note that for the group '4' in C, there are two tuples in each bag. Thus, when both bags are flattened, the cross product of these tuples is returned; that is, tuples (4, 2, 6), (4, 3, 6), (4, 2, 9), and (4, 3, 9).
400
- #
401
- # (1, 2, 3)
402
- # (4, 2, 6)
403
- # (4, 3, 6)
404
- # (4, 2, 9)
405
- # (4, 3, 9)
406
- # (8, 3, 9)
407
- # (8, 4, 9)
408
- #
409
- # == Example: Nested Block
410
- #
411
- # Suppose we have relation A and derived relation B (where B = GROUP A BY url;). Since relation B contains tuples with bags it can be treated as a nested relation.
412
- #
413
- # A (url:chararray, outlink:chararray)
414
- # ---------------------------------------------
415
- # (www.ccc.com,www.hjk.com)
416
- # (www.ddd.com,www.xyz.org)
417
- # (www.aaa.com,www.cvn.org)
418
- # (www.www.com,www.kpt.net)
419
- # (www.www.com,www.xyz.org)
420
- # (www.ddd.com,www.xyz.org)
421
- #
422
- #
423
- # B
424
- # ---------------------------------------------
425
- # (www.aaa.com,{(www.aaa.com,www.cvn.org)})
426
- # (www.ccc.com,{(www.ccc.com,www.hjk.com)})
427
- # (www.ddd.com,{(www.ddd.com,www.xyz.org),(www.ddd.com,www.xyz.org)})
428
- # (www.www.com,{(www.www.com,www.kpt.net),(www.www.com,www.xyz.org)})
429
- #
430
- # In this example we perform two of the allowed Pig operations, FILTER (FA) and DISTINCT (DA), as well as projection (PA). Note that the last statement in the nested block must be GENERATE.
431
- #
432
- # X = foreach B {
433
- # FA= FILTER A BY outlink == 'www.xyz.org';
434
- # PA = FA.outlink;
435
- # DA = DISTINCT PA;
436
- # GENERATE GROUP, COUNT(DA);
437
- # }
438
- #
439
- # Relation X looks like this.
440
- #
441
- # (www.ddd.com,1L)
442
- # (www.www.com,1L)
443
-
444
-
445
- # ---------------------------------------------------------------------------
446
- #
447
- # GROUP
448
- #
449
- # Groups the data in a single relation.
450
- #
451
- # == Syntax
452
- #
453
- # alias = GROUP alias
454
- # [BY {[field_alias [, field_alias]] | * | [expression] } ]
455
- # [ALL] [PARALLEL n];
456
- #
457
- # == Terms
458
- #
459
- # alias::
460
- # The name of a relation.
461
- #
462
- # BY::
463
- # Keyword. Use this clause to group the relation by fields or by expression.
464
- #
465
- # field_alias::
466
- # The name of a field in a relation. This is the group key or key field.
467
- #
468
- # A relation can be grouped by a single field (f1) or by the composite value of
469
- # multiple fields (f1,f2).
470
- #
471
- # *::
472
- # The asterisk. A designator for all fields in the relation.
473
- #
474
- # expression::
475
- # An expression.
476
- #
477
- # ALL::
478
- # Keyword. Use ALL if you want all tuples to go to a single group; for example, when doing aggregates across entire relations.
479
- #
480
- # PARALLEL n::
481
- # Increase the parallelism of a job by specifying the number of reduce tasks,
482
- # n. The optimal number of parallel tasks depends on the amount of memory on
483
- # each node and the memory required by each of the tasks. To determine n, use
484
- # the following as a general guideline:
485
- # n = (nr_nodes - 1) * 0.45 * nr_GB
486
- # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
487
- # memory on each node.
488
- #
489
- # Note the following:
490
- # * Parallel only affects the number of reduce tasks. Map parallelism is
491
- # determined by the input file, one map for each HDFS block.
492
- # * If you don’t specify parallel, you still get the same map parallelism but
493
- # only one reduce task.
494
- #
495
- # == Usage
496
- #
497
- # The GROUP operator groups together tuples that have the same group key (key
498
- # field). The result of a GROUP operation is a relation that includes one tuple
499
- # per group. This tuple contains two fields:
500
- #
501
- # * The first field is named "group" (do not confuse this with the GROUP operator)
502
- # and is the same type of the group key.
503
- #
504
- # * The second field takes the name of the original relation and is type bag.
505
- #
506
- # Suppose we have the following data:
507
- #
508
- # john 25 3.6
509
- # george 25 2.9
510
- # anne 27 3.9
511
- # julia 28 3.6
512
- #
513
- # And, suppose we perform the LOAD and GROUP statements shown below. We can use
514
- # the DESCRIBE operator to view the schemas for relation Y. We can use DUMP to
515
- # view the contents of Y.
516
- #
517
- # Note that relation Y has two fields. The first field is named "group" and is
518
- # type int (the same as age). The second field takes the name of the original
519
- # relation "X" and is type bag (that can contain tuples with three elements of
520
- # type chararray, int, and float).
521
- #
522
- # Statements
523
- #
524
- # X = LOAD 'data AS (name:chararray, age:int, gpa:float);
525
- # Y = GROUP X BY age;
526
- # DESCRIBE Y;
527
- # Y: {group: int,X: {name: chararray,age: int,gpa: float}}
528
- # DUMP Y;
529
- #
530
- # (25,{(john,25,3.6F),(george,25,2.9F)})
531
- # (27,{(anne,27,3.9F)})
532
- # (28,{(julia,28,3.6F)})
533
- #
534
- # As shown in this FOREACH statement, we can refer to the fields in relation Y by their names "group" and "X".
535
- #
536
- # Z = FOREACH Y GENERATE group, COUNT(X);
537
- #
538
- # Relation Z looks like this.
539
- #
540
- # (25,2L)
541
- # (27,1L)
542
- # (28,1L)
543
- #
544
- # == Examples
545
- #
546
- # Suppose we have relation A.
547
- #
548
- # A: (owner:chararray, pet:chararray)
549
- # -----------------
550
- # (Alice, turtle)
551
- # (Alice, goldfish)
552
- # (Alice, cat)
553
- # (Bob, dog)
554
- # (Bob, cat)
555
- #
556
- # In this example tuples are grouped using the field "owner."
557
- #
558
- # X = GROUP A BY owner;
559
- #
560
- # Relation X looks like this. "group" is the name of the first field. "A" is the
561
- # name of the second field.
562
- #
563
- # (Alice, {(Alice, turtle), (Alice, goldfish)})
564
- # (Bob, {(Bob, dog), (Bob, cat)})
565
- #
566
- # In this example tuples are grouped using the ALL keyword. Field "A" is then
567
- # counted and projected to from relation Y.
568
- #
569
- # X = GROUP A ALL;
570
- # Y = FOREACH X GENERATE COUNT(A);
571
- #
572
- # Relation X looks like this. "group" is the name of the first field. "A" is the
573
- # name of the second field.
574
- #
575
- # (all,{(Alice,turtle),(Alice,goldfish),(Alice,cat),(Bob,dog),(Bob,cat)})
576
- #
577
- # Relation Y looks like this.
578
- #
579
- # (5L)
580
- #
581
- # Suppose we have relation S.
582
- #
583
- #     S: (f1:chararray, f2:int, f3:int)
584
- # -----------------
585
- # (r1, 1, 2)
586
- # (r2, 2, 1)
587
- # (r3, 2, 8)
588
- # (r4, 4, 4)
589
- #
590
- # In this example tuples are grouped using an expression, f2*f3.
591
- #
592
- # X = GROUP S BY f2*f3;
593
- #
594
- #    Relation X looks like this. The first field is named "group". The second field is named "S".
595
- #
596
- # (2, {(r1, 1, 2), (r2, 2, 1)})
597
- # (16, {(r3, 2, 8), (r4, 4, 4)})
598
-
599
-
600
- # ---------------------------------------------------------------------------
601
- #
602
- # JOIN
603
- #
604
- # Joins two or more relations based on common field values.
605
- #
606
- # == Syntax
607
- #
608
- # alias = JOIN alias BY field_alias,
609
- # alias BY field_alias [, alias BY field_alias …]
610
- # [PARALLEL n];
611
- #
612
- # == Terms
613
- #
614
- # alias::
615
- # The name of a relation.
616
- #
617
- # BY::
618
- # Keyword.
619
- #
620
- # field_alias::
621
- # The name of a field in a relation. The alias and field_alias specified in the
622
- # BY clause must correspond.
623
- #
624
- # == Example:
625
- # X = JOIN relationA BY fieldA, relationB by fieldB, relationC by fieldC;
626
- #
627
- # PARALLEL n::
628
- # Increase the parallelism of a job by specifying the number of reduce tasks,
629
- # n. The optimal number of parallel tasks depends on the amount of memory on
630
- # each node and the memory required by each of the tasks. To determine n, use
631
- # the following as a general guideline:
632
- # n = (nr_nodes - 1) * 0.45 * nr_GB
633
- # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
634
- # memory on each node.
635
- #
636
- # Note the following:
637
- # * Parallel only affects the number of reduce tasks. Map parallelism is
638
- # determined by the input file, one map for each HDFS block.
639
- # * If you don’t specify parallel, you still get the same map parallelism but
640
- # only one reduce task.
641
- #
642
- # == Usage
643
- #
644
- # Use the JOIN operator to join two or more relations based on common field
645
- # values. The JOIN operator always performs an inner join.
646
- #
647
- # Note: The JOIN and COGROUP operators perform similar functions. JOIN creates a
648
- # flat set of output records while COGROUP creates a nested set of output records.
649
- #
650
- # == Example
651
- #
652
- # Suppose we have relations A and B.
653
- #
654
- # (A: a1, a2, a3) (B: b1, b2)
655
- # ----------------- ---------------
656
- # (1, 2, 3) (2, 4)
657
- # (4, 2, 1) (8, 9)
658
- # (8, 3, 4) (1, 3)
659
- # (4, 3, 3) (2, 7)
660
- # (7, 2, 5) (2, 9)
661
- # (8, 4, 3) (4, 6)
662
- # (4, 9)
663
- #
664
- # In this example relations A and B are joined on their first fields.
665
- #
666
- # X = JOIN A BY a1, B BY b1;
667
- #
668
- # Relation X looks like this.
669
- #
670
- # (1, 2, 3, 1, 3)
671
- # (4, 2, 1, 4, 6)
672
- # (4, 3, 3, 4, 6)
673
- # (4, 2, 1, 4, 9)
674
- # (4, 3, 3, 4, 9)
675
- # (8, 3, 4, 8, 9)
676
- # (8, 4, 3, 8, 9)
677
- #
678
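- #    For contrast with the flat JOIN output above, here is a rough sketch of what
- #    COGROUP would produce for the same relations and keys:
- #
- #      Y = COGROUP A BY a1, B BY b1;
- #
- #    yields one nested tuple per key, with a bag drawn from each input:
- #
- #      (1, {(1, 2, 3)}, {(1, 3)})
- #      (2, {}, {(2, 4), (2, 7), (2, 9)})
- #      (4, {(4, 2, 1), (4, 3, 3)}, {(4, 6), (4, 9)})
- #      (7, {(7, 2, 5)}, {})
- #      (8, {(8, 3, 4), (8, 4, 3)}, {(8, 9)})
- #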
-
679
-
680
- # ---------------------------------------------------------------------------
681
- #
682
- # LIMIT
683
- #
684
- # Limits the number of output tuples.
685
- #
686
- # == Syntax
687
- #
688
- # alias = LIMIT alias n;
689
- #
690
- # == Terms
691
- #
692
- # alias::
693
- # The name of a relation.
694
- #
695
- # n::
696
- # The number of tuples.
697
- #
698
- # == Usage
699
- #
700
- # Use the LIMIT operator to limit the number of output tuples (rows). If the
701
- # specified number of output tuples is equal to or exceeds the number of tuples in
702
- # the relation, the output will include all tuples in the relation.
703
- #
704
- # There is no guarantee which tuples will be returned, and the tuples that are
705
- # returned can change from one run to the next. A particular set of tuples can be
706
- # requested using the ORDER operator followed by LIMIT.
707
- #
708
- # Note: The LIMIT operator allows Pig to avoid processing all tuples in a
709
- # relation. In most cases a query that uses LIMIT will run more efficiently than
710
- # an identical query that does not use LIMIT. It is always a good idea to use
711
- # limit if you can.
712
- #
713
- # == Examples
714
- #
715
- # Suppose we have relation A.
716
- #
717
- # (A: f1:int, f2:int, f3:int)
718
- # -----------------
719
- # (1, 2, 3)
720
- # (4, 2, 1)
721
- # (8, 3, 4)
722
- # (4, 3, 3)
723
- # (7, 2, 5)
724
- # (8, 4, 3)
725
- #
726
- # In this example output is limited to 3 tuples.
727
- #
728
- # X = LIMIT A 3;
729
- #
730
- # Relation X could look like this (there is no guarantee which three tuples will be output).
731
- #
732
- # (1, 2, 3)
733
- # (4, 3, 3)
734
- # (7, 2, 5)
735
- #
736
- # In this example the ORDER operator is used to order the tuples and the LIMIT operator is used to output the first three tuples.
737
- #
738
- # B = ORDER A BY f1 DESC, f2 ASC;
739
- # X = LIMIT B 3;
740
- #
741
- # Relation B and relation X look like this.
742
- #
743
- # (B) (X)
744
- # ----------- -----------
745
- # (8, 3, 4) (8, 3, 4)
746
- # (8, 4, 3) (8, 4, 3)
747
- # (7, 2, 5) (7, 2, 5)
748
- # (4, 2, 1)
749
- # (4, 3, 3)
750
- # (1, 2, 3)
751
-
752
-
753
- # ---------------------------------------------------------------------------
754
- #
755
- # LOAD
756
- #
757
- # Loads data from the file system.
758
- #
759
- # == Syntax
760
- #
761
- # LOAD 'data' [USING function] [AS schema];
762
- #
763
- # == Terms
764
- #
765
- # 'data'::
766
- # The name of the file or directory, in single quotes.
767
- #
768
- # If you specify a directory name, all the files in the directory are loaded.
769
- #
770
- #    You can use Hadoop-supported globbing to specify files at the file system or
771
- #    directory levels (see the Hadoop glob documentation for details on globbing
772
- # syntax).
773
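- #
- #    For example, a glob (hypothetical paths shown) can load many files at once:
- #
- #      A = LOAD '/logs/2008-*/part-*';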
- #
774
- # USING::
775
- # Keyword.
776
- #
777
- # function::
778
- # The load function.
779
- #
780
- # PigStorage is the default load/store function and does not need to be
781
- # specified. This function reads/writes simple newline-separated records with
782
- # delimiter-separated fields. The function has one parameter, the field
783
- #    delimiter (tab ('\t') is the default delimiter).
784
- #
785
- # If the data is stored in a special format that the Pig load functions cannot
786
- # parse, you can write your own load function.
787
- #
788
- # AS::
789
- # Keyword.
790
- #
791
- # schema::
792
- # A schema using the AS keyword, enclosed in parentheses (see Schemas).
793
- #
794
- # == Usage
795
- #
796
- # Use the LOAD operator to load data from the file system.
797
- #
798
- # == Examples
799
- #
800
- # Suppose we have a data file called myfile.txt. The fields are tab-delimited. The
801
- # records are newline-separated.
802
- #
803
- # 1 2 3
804
- # 4 2 1
805
- # 8 3 4
806
- #
807
- # In this example the default load function, PigStorage, loads data from
808
- # myfile.txt into relation A. Note that, because no schema is specified, the
809
- # fields are not named and all fields default to type bytearray. The two
810
- # statements are equivalent.
811
- #
812
- # A = LOAD 'myfile.txt';
813
- # A = LOAD 'myfile.txt' USING PigStorage('\t');
814
- #
815
- # Relation A looks like this.
816
- #
817
- # (1, 2, 3)
818
- # (4, 2, 1)
819
- # (8, 3, 4)
820
- #
821
- # In this example a schema is specified using the AS keyword. The two statements
822
- # are equivalent.
823
- #
824
- # A = LOAD 'myfile.txt' AS (f1:int, f2:int, f3:int);
825
- #      A = LOAD 'myfile.txt' USING PigStorage('\t') AS (f1:int, f2:int, f3:int);
826
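- #
- #    As a further sketch (with a hypothetical comma-delimited file), the same
- #    function reads other delimiters by passing them as its parameter:
- #
- #      A = LOAD 'myfile.csv' USING PigStorage(',') AS (f1:int, f2:int, f3:int);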
-
827
-
828
- # ---------------------------------------------------------------------------
829
- #
830
- # ORDER
831
- #
832
- # Sorts a relation based on one or more fields.
833
- #
834
- # == Syntax
835
- #
836
- # alias = ORDER alias BY { * [ASC|DESC] | field_alias [ASC|DESC]
837
- # [, field_alias [ASC|DESC] …] } [PARALLEL n];
838
- #
839
- # == Terms
840
- #
841
- # alias::
842
- # The name of a relation.
843
- #
844
- # BY::
845
- # Required keyword.
846
- #
847
- # *::
848
- # Represents all fields in the relation.
849
- #
850
- # ASC::
851
- # Sort in ascending order.
852
- #
853
- # DESC::
854
- # Sort in descending order.
855
- #
856
- # field_alias::
857
- # A field in the relation.
858
- #
859
- # PARALLEL n::
860
- # Increase the parallelism of a job by specifying the number of reduce tasks,
861
- # n. The optimal number of parallel tasks depends on the amount of memory on
862
- # each node and the memory required by each of the tasks. To determine n, use
863
- # the following as a general guideline:
864
- # n = (nr_nodes - 1) * 0.45 * nr_GB
865
- # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
866
- # memory on each node.
867
- #
868
- # Note the following:
869
- # * Parallel only affects the number of reduce tasks. Map parallelism is
870
- # determined by the input file, one map for each HDFS block.
871
- # * If you don’t specify parallel, you still get the same map parallelism but
872
- # only one reduce task.
873
- #
874
- # == Usage
875
- #
876
- # In Pig, relations are logically unordered.
877
- #
878
- # * If you order relation A to produce relation X (X = ORDER A BY * DESC;),
879
- # relations A and X still contain the same thing.
880
- #
881
- # * If you retrieve the contents of relation X, they are guaranteed to be in the
882
- # order you specified (descending).
883
- #
884
- # * However, if you further process relation X, there is no guarantee that the
885
- # contents will be processed in the order you specified.
886
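- #
- #    A small sketch of that last point (field names are hypothetical): after
- #
- #      X = ORDER A BY f1 DESC;
- #      Y = FILTER X BY f3 > 2;
- #
- #    there is no guarantee that Y's contents will be processed in descending f1 order.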
- #
887
- # == Examples
888
- #
889
- # Suppose we have relation A.
890
- #
891
- # (A: f1, f2, f3)
892
- # -----------------
893
- # (1, 2, 3)
894
- # (4, 2, 1)
895
- # (8, 3, 4)
896
- # (4, 3, 3)
897
- # (7, 2, 5)
898
- # (8, 4, 3)
899
- #
900
- # In this example relation A is sorted by the third field, f3 in descending order.
901
- #
902
- # X = ORDER A BY f3 DESC;
903
- #
904
- # Relation X could look like this (note that the order of the three tuples ending
905
- # in 3 can vary).
906
- #
907
- # (7, 2, 5)
908
- # (8, 3, 4)
909
- # (1, 2, 3)
910
- # (4, 3, 3)
911
- # (8, 4, 3)
912
- # (4, 2, 1)
913
-
914
-
915
- # ---------------------------------------------------------------------------
916
- #
917
- # SPLIT
918
- #
919
- # Partitions a relation into two or more relations.
920
- #
921
- # == Syntax
922
- #
923
- # SPLIT alias INTO alias IF expression, alias IF expression [, alias IF expression …];
924
- #
925
- # == Terms
926
- #
927
- # alias::
928
- # The name of a relation.
929
- #
930
- # INTO::
931
- # Required keyword.
932
- #
933
- # IF::
934
- # Required keyword.
935
- #
936
- # expression::
937
- # An expression.
938
- #
939
- # == Usage
940
- #
941
- # Use the SPLIT operator to partition a relation into two or more relations based
942
- # on some expression. Depending on the expression:
943
- #
944
- # * A tuple may be assigned to more than one relation.
945
- #
946
- # * A tuple may not be assigned to any relation.
947
- #
948
- # == Example
949
- #
950
- # Suppose we have relation A.
951
- #
952
- # (A: f1, f2, f3)
953
- # -----------------
954
- # (1, 2, 3)
955
- # (4, 5, 6)
956
- # (7, 8, 9)
957
- #
958
- # In this example relation A is split into three relations, X, Y, and Z.
959
- #
960
- # SPLIT A INTO X IF f1< 7, Y IF f2==5, Z IF (f3<6 OR f3>6);
961
- #
962
- # Relations X, Y, and Z look like this.
963
- #
964
- # (X) (Y) (Z)
965
- # ---------- ----------- -----------
966
- # (1, 2, 3) (4, 5, 6) (1, 2, 3)
967
- # (4, 5, 6) (7, 8, 9)
968
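- #
- #    To sketch the other case noted above (a tuple assigned to no relation): had
- #    A also contained a hypothetical tuple (7, 8, 6), it would satisfy none of
- #    the three conditions (7 < 7, 8 == 5, and 6 < 6 OR 6 > 6 are all false) and
- #    would not appear in X, Y, or Z.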
-
969
-
970
- # ---------------------------------------------------------------------------
971
- #
972
- # STORE
973
- #
974
- # Stores data to the file system.
975
- #
976
- # == Syntax
977
- #
978
- # STORE alias INTO 'directory' [USING function];
979
- #
980
- # == Terms
981
- #
982
- # alias::
983
- # The name of a relation.
984
- #
985
- # INTO::
986
- # Required keyword.
987
- #
988
- # 'directory'::
989
- # The name of the storage directory, in quotes. If the directory already exists, the STORE operation will fail.
990
- #
991
- #
992
- #
993
- # The output data files, named part-nnnnn, are written to this directory.
994
- #
995
- # USING::
996
- # Keyword. Use this clause to name the store function.
997
- #
998
- # function::
999
- #    The store function.
1000
- #
1001
- #    PigStorage is the default load/store function and does not need to be specified. This function reads/writes simple newline-separated records with delimiter-separated fields. The function has one parameter, the field delimiter (tab ('\t') is the default delimiter).
1002
- #
1003
- # If you want to store the data in a special format that the Pig Load/Store functions cannot handle, you can write your own store function.
1004
- #
1005
- # == Usage
1006
- #
1007
- # Use the STORE operator to store data on the file system.
1008
- #
1009
- # == Example
1010
- #
1011
- # Suppose we have relation A.
1012
- #
1013
- # (A)
1014
- #
1015
- # ----------------
1016
- # (1, 2, 3)
1017
- # (4, 2, 1)
1018
- # (8, 3, 4)
1019
- # (4, 3, 3)
1020
- # (7, 2, 5)
1021
- # (8, 4, 3)
1022
- #
1023
- # In this example the contents of relation A are written to file part-00000 located in directory myoutput.
1024
- #
1025
- #    STORE A INTO 'myoutput' USING PigStorage('*');
1026
- #
1027
- # The part-00000 file looks like this. Fields are delimited with the asterisk * characters and records are separated by newlines.
1028
- #
1029
- # 1*2*3
1030
- # 4*2*1
1031
- # 8*3*4
1032
- # 4*3*3
1033
- # 7*2*5
1034
- # 8*4*3
1035
- #
1036
-
1037
-
1038
- # ---------------------------------------------------------------------------
1039
- #
1040
- # STREAM
1041
- #
1042
- # Sends data to an external script or program.
1043
- #
1044
- # == Syntax
1045
- #
1046
- # alias = STREAM alias [, alias …] THROUGH {`command` | cmd_alias } [AS schema] ;
1047
- #
1048
- # == Terms
1049
- #
1050
- # alias::
1051
- # The name of a relation.
1052
- #
1053
- # THROUGH::
1054
- # Keyword.
1055
- #
1056
- # `command`::
1057
- #    A command, including the arguments, enclosed in backticks (where a command is anything that can be executed).
1058
- #
1059
- # cmd_alias::
1060
- # The name of a command created using the DEFINE operator.
1061
- #
1062
- # AS::
1063
- # Keyword.
1064
- #
1065
- # schema::
1066
- # A schema using the AS keyword, enclosed in parentheses (see Schemas).
1067
- #
1068
- # == Usage
1069
- #
1070
- # Use the STREAM operator to send data through an external script or program. Multiple stream operators can appear in the same Pig script. The stream operators can be adjacent to each other or have other operations in between.
1071
- #
1072
- # When used with a command, a stream statement could look like this:
1073
- #
1074
- # A = LOAD 'data';
1075
- #
1076
- # B = STREAM A THROUGH `stream.pl -n 5`;
1077
- #
1078
- # When used with a cmd_alias, a stream statement could look like this, where cmd is the defined alias.
1079
- #
1080
- # A = LOAD 'data';
1081
- #
1082
- #      DEFINE cmd `stream.pl -n 5`;
1083
- #
1084
- # B = STREAM A THROUGH cmd;
1085
- #
- # == About Data Guarantees
1086
- #
1087
- # Data guarantees are determined based on the position of the streaming operator in the Pig script.
1088
- #
1089
- # * Unordered data – No guarantee for the order in which the data is delivered to
1090
- # the streaming application.
1091
- #
1092
- # * Grouped data – The data for the same grouped key is guaranteed to be provided
1093
- # to the streaming application contiguously
1094
- #
1095
- # * Grouped and ordered data – The data for the same grouped key is guaranteed to
1096
- # be provided to the streaming application contiguously. Additionally, the data
1097
- # within the group is guaranteed to be sorted by the provided secondary key.
1098
- #
1099
- # In addition to position, data grouping and ordering can be determined by the
1100
- # data itself. However, you need to know the property of the data to be able to
1101
- # take advantage of its structure.
1102
- #
1103
- # == Example: Data Guarantees
1104
- #
1105
- # In this example the data is unordered.
1106
- #
1107
- # A = LOAD 'data';
1108
- # B = STREAM A THROUGH `stream.pl`;
1109
- #
1110
- # In this example the data is grouped.
1111
- #
1112
- # A = LOAD 'data';
1113
- # B = GROUP A BY $1;
1114
- #      C = FOREACH B GENERATE FLATTEN(A);
1115
- #      D = STREAM C THROUGH `stream.pl`;
1116
- #
1117
- # In this example the data is grouped and ordered.
1118
- #
1119
- # A = LOAD 'data';
1120
- # B = GROUP A BY $1;
1121
- # C = FOREACH B {
1122
- # D = ORDER A BY ($3, $4);
1123
- # GENERATE D;
1124
- # }
1125
- # E = STREAM C THROUGH `stream.pl`;
1126
- #
1127
- # == Example: Schemas
1128
- #
1129
- # In this example a schema is specified as part of the STREAM statement.
1130
- #
1131
- #      X = STREAM A THROUGH `stream.pl` AS (f1:int, f2:int, f3:int);
1132
- #
1133
- # == Additional Examples
1134
- #
1135
- # See DEFINE for additional examples.
1136
-
1137
-
1138
- # ---------------------------------------------------------------------------
1139
- #
1140
- # UNION
1141
- #
1142
- # Computes the union of two or more relations.
1143
- #
1144
- # == Syntax
1145
- #
1146
- # alias = UNION alias, alias [, alias …];
1147
- #
1148
- # == Terms
1149
- #
1150
- # alias::
1151
- # The name of a relation.
1152
- #
1153
- # == Usage
1154
- #
1155
- # Use the UNION operator to compute the union of two or more relations. The UNION operator:
1156
- #
1157
- # * Does not preserve the order of tuples. Both the input and output relations are
1158
- # interpreted as unordered bags of tuples.
1159
- #
1160
- # * Does not ensure (as databases do) that all tuples adhere to the same schema or
1161
- # that they have the same number of fields. In a typical scenario, however, this
1162
- # should be the case; therefore, it is the user's responsibility to either (1)
1163
- # ensure that the tuples in the input relations have the same schema or (2) be
1164
- # able to process varying tuples in the output relation.
1165
- #
1166
- # * Does not eliminate duplicate tuples.
1167
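- #
- #    As a small sketch of the last point (hypothetical data): if both A and B
- #    contained the tuple (1, 2), then X = UNION A, B; would contain (1, 2) twice.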
- #
1168
- # == Example
1169
- #
1170
- # Suppose we have relations A and B.
1171
- #
1172
- # (A) (B)
1173
- # ----------- --------
1174
- # (1, 2, 3) (2, 4)
1175
- # (4, 2, 1) (8, 9)
1176
- # (1, 3)
1177
- #
1178
- #    In this example the union of relations A and B is computed.
1179
- #
1180
- # X = UNION A, B;
1181
- #
1182
- # Relation X looks like this.
1183
- #
1184
- # (1, 2, 3)
1185
- # (4, 2, 1)
1186
- # (2, 4)
1187
- # (8, 9)
1188
- # (1, 3)
1189
- #
- # ---------------------------------------------------------------------------
- #
- # Diagnostic Operators
- #
- # DESCRIBE
1191
- #
1192
- # Returns the schema of an alias.
1193
- #
1194
- # == Syntax
1195
- #
1196
- # DESCRIBE alias;
1197
- #
1198
- # == Terms
1199
- #
1200
- # alias::
1201
- # The name of a relation.
1202
- #
1203
- # == Usage
1204
- #
1205
- # Use the DESCRIBE operator to review the schema of a particular alias.
1206
- #
1207
- # == Example
1208
- #
1209
- # In this example a schema is specified using the AS clause.
1210
- #
1211
- # A = LOAD 'students' AS (name:chararray, age:int, gpa:float);
1212
- # B = FILTER A BY name matches 'John%';
1213
- # C = GROUP B BY name;
1214
- #      D = FOREACH C GENERATE COUNT(B.age);
1215
- # DESCRIBE A;
1216
- #      A: {name: chararray,age: int,gpa: float}
1217
- # DESCRIBE B;
1218
- #      B: {name: chararray,age: int,gpa: float}
1219
- # DESCRIBE C;
1220
- #      C: {group: chararray,B: {name: chararray,age: int,gpa: float}}
1221
- # DESCRIBE D;
1222
- # D: {long}
1223
- #
1224
- # In this example no schema is specified. All data items default to type bytearray.
1225
- #
1226
- # grunt> a = LOAD '/data/students';
1227
- # grunt> b = FILTER a BY $0 matches 'John%';
1228
- # grunt> c = GROUP b BY $0;
1229
- # grunt> d = FOREACH c GENERATE COUNT(b.$1);
1230
- # grunt> DESCRIBE a;
1231
- #
1232
- # Schema for a unknown.
1233
- #
1234
- # grunt> DESCRIBE b;
1235
- # 2008-12-05 01:17:15,316 [main] WARN org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
1236
- #
1237
- # Schema for b unknown.
1238
- #
1239
- # grunt> DESCRIBE c;
1240
- #      2008-12-05 01:17:23,343 [main] WARN org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
1241
- #
1242
- # c: {group: bytearray,b: {null}}
1243
- #
1244
- # grunt> DESCRIBE d;
1245
- #      2008-12-05 03:04:30,076 [main] WARN org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
1246
- #
1247
- # d: {long}
1248
- #
1249
- # ---------------------------------------------------------------------------
- #
- # DUMP
1250
- #
1251
- # Displays the contents of an alias.
1252
- #
1253
- # == Syntax
1254
- #
1255
- # DUMP alias;
1256
- #
1257
- # == Terms
1258
- #
1259
- # alias::
1260
- # The name of a relation.
1261
- #
1262
- # == Usage
1263
- #
1264
- # Use the DUMP operator to display the contents of an alias. You can use DUMP as a
1265
- # debugging device to make sure the correct results are being generated.
1266
- #
1267
- # == Example
1268
- #
1269
- # In this example a dump is performed after each statement.
1270
- #
1271
- # A = LOAD 'students' AS (name:chararray, age:int, gpa:float);
1272
- # DUMP A;
1273
- # B = FILTER A BY name matches 'John%';
1274
- # DUMP B;
1275
- #      C = GROUP B BY name;
1276
- # DUMP C;
1277
- # D = FOREACH C GENERATE COUNT(B.age);
1278
- # DUMP D;
1279
- #
1280
- # ---------------------------------------------------------------------------
- #
- # EXPLAIN
1281
- #
1282
- # Displays execution plans.
1283
- #
1284
- # == Syntax
1285
- #
1286
- # EXPLAIN alias;
1287
- #
1288
- # == Terms
1289
- #
1290
- # alias::
1291
- # The name of a relation.
1292
- #
1293
- # == Usage
1294
- #
1295
- # Use the EXPLAIN operator to review the logical, physical, and map reduce
1296
- # execution plans that are used to compute the specified relationship.
1297
- #
1298
- # * The logical plan shows a pipeline of operators to be executed to build the
1299
- # relation. Type checking and backend-independent optimizations (such as
1300
- # applying filters early on) also apply.
1301
- #
1302
- # * The physical plan shows how the logical operators are translated to
1303
- # backend-specific physical operators. Some backend optimizations also apply.
1304
- #
1305
- # * The map reduce plan shows how the physical operators are grouped into map
1306
- # reduce jobs.
1307
- #
1308
- # == Example
1309
- #
1310
- # In this example the EXPLAIN operator produces all three plans. (Note that only a
1311
- # portion of the output is shown in this example.)
1312
- #
1313
- # A = LOAD 'students' AS (name:chararray, age:int, gpa:float);
1314
- # B = GROUP A BY name;
1315
- # C = FOREACH B GENERATE COUNT(A.age);
1316
- # EXPLAIN C;
1317
- #
1318
- #
1319
- # Logical Plan:
1320
- #
1321
- # Store xxx-Fri Dec 05 19:42:29 UTC 2008-23 Schema: {long} Type: Unknown
1322
- # |
1323
- # |---ForEach xxx-Fri Dec 05 19:42:29 UTC 2008-15 Schema: {long} Type: bag
1324
- # etc …
1325
- #
1326
- # -----------------------------------------------
1327
- # Physical Plan:
1328
- # -----------------------------------------------
1329
- # Store(fakefile:org.apache.pig.builtin.PigStorage) - xxx-Fri Dec 05 19:42:29 UTC 2008-40
1330
- # |
1331
- # |---New For Each(false)[bag] - xxx-Fri Dec 05 19:42:29 UTC 2008-39
1332
- # | |
1333
- # | POUserFunc(org.apache.pig.builtin.COUNT)[long] - xxx-Fri Dec 05
1334
- # etc …
1335
- #
1336
- # --------------------------------------------------
1337
- # | Map Reduce Plan |
1338
- # --------------------------------------------------
1339
- # MapReduce node xxx-Fri Dec 05 19:42:29 UTC 2008-41
1340
- # Map Plan
1341
- # Local Rearrange[tuple]{chararray}(false) - xxx-Fri Dec 05 19:42:29 UTC 2008-34
1342
- # | |
1343
- # | Project[chararray][0] - xxx-Fri Dec 05 19:42:29 UTC 2008-35
1344
- # etc …
1345
- #
- # ---------------------------------------------------------------------------
- #
- # ILLUSTRATE
1346
- #
1347
- # Displays a step-by-step execution of a sequence of statements.
1348
- #
1349
- # == Syntax
1350
- #
1351
- # ILLUSTRATE alias;
1352
- #
1353
- # == Terms
1354
- #
1355
- # alias::
1356
- # The name of a relation.
1357
- #
1358
- # == Usage
1359
- #
1360
- # Use the ILLUSTRATE operator to review how data items are transformed through a
1361
- # sequence of Pig Latin statements.
1362
- #
1363
- # ILLUSTRATE accesses the ExampleGenerator algorithm which can select an
1364
- # appropriate and concise set of example data items automatically. It does a
1365
- # better job than random sampling would do; for example, random sampling suffers
1366
- # from the drawback that selective operations such as filters or joins can
1367
- # eliminate all the sampled data items, giving you empty results which is of no
1368
- # help with debugging.
1369
- #
1370
- # With the ILLUSTRATE operator you can test your programs on small datasets and
1371
- # get faster turnaround times. The ExampleGenerator algorithm uses Pig's Local
1372
- # mode (rather than Hadoop mode) which means that illustrative example data is
1373
- # generated in near real-time.
1374
- #
1375
- # == Example
1376
- #
1377
- # Suppose we have a data file called 'visits.txt'.
1378
- # Amy cnn.com 20080218
1379
- # Fred harvard.edu 20081204
1380
- # Amy bbc.com 20081205
1381
- # Fred stanford.edu 20081206
1382
- #
1383
- # In this example we count the number of sites a user has visited since
1384
- # 12/1/08. The ILLUSTRATE statement will show how the results for num_user_visits
1385
- # are derived.
1386
- #
1387
- # visits = LOAD 'visits.txt' AS (user:chararray, url:chararray, timestamp:chararray);
1388
- #
1389
- # recent_visits = FILTER visits BY timestamp >= '20081201';
1390
- #
1391
- # user_visits = GROUP recent_visits BY user;
1392
- #
1393
- # num_user_visits = FOREACH user_visits GENERATE COUNT(recent_visits);
1394
- #
1395
- #      ILLUSTRATE num_user_visits;
1396
- #
1397
- # The output from the ILLUSTRATE statement looks like this.
1398
- #
1399
- # ------------------------------------------------------------------------
1400
- #
1401
- # | visits | user: bytearray | url: bytearray | timestamp: bytearray |
1402
- # ------------------------------------------------------------------------
1403
- # | | Amy | cnn.com | 20080218 |
1404
- # | | Fred | harvard.edu | 20081204 |
1405
- # | | Amy | bbc.com | 20081205 |
1406
- # | | Fred | stanford.edu | 20081206 |
1407
- # ------------------------------------------------------------------------
1408
- #
1409
- # -------------------------------------------------------------------------------
1410
- # | recent_visits | user: chararray | url: chararray | timestamp: chararray |
1411
- # -------------------------------------------------------------------------------
1412
- # | | Fred | harvard.edu | 20081204 |
1413
- # | | Amy | bbc.com | 20081205 |
1414
- # | | Fred | stanford.edu | 20081206 |
1415
- # -------------------------------------------------------------------------------
1416
- #
1417
- # ------------------------------------------------------------------------------------------------------------------
1418
- # | user_visits | group: chararray | recent_visits: bag({user: chararray,url: chararray,timestamp: chararray}) |
1419
- # ------------------------------------------------------------------------------------------------------------------
1420
- # | | Amy | {(Amy, bbc.com, 20081205)} |
1421
- # | | Fred | {(Fred, harvard.edu, 20081204), (Fred, stanford.edu, 20081206)} |
1422
- # ------------------------------------------------------------------------------------------------------------------
1423
- #
1424
- # -------------------------------
1425
- # | num_user_visits | long |
1426
- # -------------------------------
1427
- # | | 1 |
1428
- # | | 2 |
1429
- # -------------------------------
1430
- #
1431
-
1432
- # ---------------------------------------------------------------------------
1433
- #
1434
- # DEFINE
1435
- #
1436
- # Assigns an alias to a function or command.
1437
- #
1438
- # == Syntax
1439
- #
1440
- # DEFINE alias {function | [`command` [input] [output] [ship] [cache]] };
1441
- #
1442
- # == Terms
1443
- #
1444
- # alias::
1445
- # The name for the function or command.
1446
- #
1447
- # function::
1448
- # The name of a function.
1449
- #
1450
- # Use this option to define functions for use with the FOREACH and FILTER operators.
1451
- #
1452
- # `command`::
1453
- #    A command, including the arguments, enclosed in backticks (where a command is anything that can be executed).
1454
- #
1455
- # Use this option to define commands for use with the STREAM operator.
1456
- #
1457
- # input::
1458
- # INPUT ( {stdin | 'path'} [USING serializer] [, {stdin | 'path'} [USING serializer] …] )
1459
- #
1460
- # Where:
1461
- # * INPUT – Keyword.
1462
- # * 'path' – A file path, enclosed in single quotes.
1463
- # * USING – Keyword.
1464
- # * serializer – A function that converts data from tuples to stream format. PigStorage is the default serializer. You can also write your own UDF.
1465
- #
1466
- # output::
1467
- # OUTPUT ( {stdout | stderr | 'path'} [USING deserializer] [, {stdout | stderr | 'path'} [USING deserializer] …] )
1468
- #
1469
- # Where:
1470
- #
1471
- # * OUTPUT – Keyword.
1472
- # * 'path' – A file path, enclosed in single quotes.
1473
- # * USING – Keyword.
1474
- # * deserializer – A function that converts data from stream format to tuples. PigStorage is the default deserializer. You can also write your own UDF.
1475
- #
1476
- # ship::
1477
- # SHIP('path' [, 'path' …])
1478
- #
1479
- # Where:
1480
- #
1481
- # * SHIP – Keyword.
1482
- # * 'path' – A file path, enclosed in single quotes.
1483
- #
1484
- # cache::
1485
- # CACHE('dfs_path#dfs_file' [, 'dfs_path#dfs_file' …])
1486
- #
1487
- # Where:
1488
- #
1489
- # * CACHE – Keyword.
1490
- # * 'dfs_path#dfs_file' – A file path/file name on the distributed file system,
1491
- # enclosed in single quotes. Example: '/mydir/mydata.txt#mydata.txt'
1492
- #
1493
- #
1494
- # == Usage
1495
- #
1496
- # Use the DEFINE statement to assign a name (alias) to a function or to a command.
1497
- #
1498
- # Use DEFINE to specify a function when:
1499
- #
1500
- #    * The function has a long package name that you don't want to include in a
1501
- # script, especially if you call the function several times in that script.
1502
- #
1503
- # * The constructor for the function takes parameters (see the first example
1504
- # below). If you need to use different constructor parameters for different
1505
- # calls to the function you will need to create multiple defines – one for each
1506
- # parameter set.
1507
- #
1508
- # Use DEFINE to specify a command when the streaming command specification is
1509
- # complex or requires additional parameters (input, output, and so on).
1510
- #
1511
- # === About Input and Output
1512
- #
1513
- # Serialization is needed to convert data from tuples to a format that can be
1514
- # processed by the streaming application. Deserialization is needed to convert the
1515
- # output from the streaming application back into tuples.
1516
- #
1517
- # PigStorage, the default serialization/deserialization function, converts tuples
1518
- # to tab-delimited lines. Pig's BinarySerializer and BinaryDeserializer functions
1519
- # treat the entire file as a byte stream (no formatting or interpretation takes
1520
- # place). You can also write your own serialization/deserialization functions.
1521
- #
1522
- # === About Ship
1523
- #
1524
- # Use the ship option to send streaming binary and supporting files, if any, from
1525
- # the client node to the compute nodes. Pig does not automatically ship
1526
- # dependencies; it is your responsibility to explicitly specify all the
1527
- # dependencies and to make sure that the software the processing relies on (for
1528
- # instance, perl or python) is installed on the cluster. Supporting files are
1529
- # shipped to the task's current working directory and only relative paths should
1530
- # be specified. Any pre-installed binaries should be specified in the path.
1531
- #
1532
- # Only files, not directories, can be specified with the ship option. One way to
1533
- # work around this limitation is to tar all the dependencies into a tar file that
1534
- # accurately reflects the structure needed on the compute nodes, then have a
1535
- # wrapper for your script that un-tars the dependencies prior to execution.
1536
- #
1537
- # Note that the ship option has two components: the source specification, provided
1538
- # in the ship clause, is the view of your machine; the command specification is
1539
- #    the view of the cluster. The only guarantee is that the shipped files are
1540
- #    available in the current working directory of the launched job and that your
1541
- # current working directory is also on the PATH environment variable.
1542
- #
1543
- # Shipping files to relative paths or absolute paths is not supported since you
1544
- # might not have permission to read/write/execute from arbitrary paths on the
1545
- # clusters.
1546
- #
1547
- # === About Cache
1548
- #
1549
- # The ship option works with binaries, jars, and small datasets. However, loading
1550
- # larger datasets at run time for every execution can severely impact
1551
- # performance. Instead, use the cache option to access large files already moved
1552
- # to and available on the compute nodes. Only files, not directories, can be
1553
- # specified with the cache option.
1554
- #
1555
- # == Example: Input/Output
1556
- #
1557
- # In this example PigStorage is the default serialization/deserialization
1558
- # function. The tuples from relation A are converted to tab-delimited lines that
1559
- # are passed to the script.
1560
- #
1561
- # X = STREAM A THROUGH `stream.pl`;
1562
- #
1563
- # In this example PigStorage is used as the serialization/deserialization
1564
- # function, but a comma is used as the delimiter.
1565
- #
1566
- # DEFINE Y `stream.pl` INPUT(stdin USING PigStorage(',')) OUTPUT (stdout USING PigStorage(','));
1567
- # X = STREAM A THROUGH Y;
1568
- #
1569
- # In this example user-defined serialization/deserialization functions are used
1570
- # with the script.
1571
- #
1572
- # DEFINE Y `stream.pl` INPUT(stdin USING MySerializer) OUTPUT (stdout USING MyDeserializer);
1573
- # X = STREAM A THROUGH Y;
1574
- #
1575
- # == Example: Ship/Cache
1576
- #
1577
- # In this example ship is used to send the script to the cluster compute nodes.
1578
- #
1579
- # DEFINE Y `stream.pl` SHIP('/work/stream.pl');
1580
- # X = STREAM A THROUGH Y;
1581
- #
1582
- # In this example cache is used to specify a file located on the cluster compute
1583
- # nodes.
1584
- #
1585
- # DEFINE Y `stream.pl data.gz` SHIP('/work/stream.pl') CACHE('/input/data.gz#data.gz');
1586
- # X = STREAM A THROUGH Y;
1587
- #
1588
- # == Example: Logging
1589
- #
1590
- # In this example the streaming stderr is stored in the _logs/<dir> directory of
1591
- # the job's output directory. Because the job can have multiple streaming
1592
- # applications associated with it, you need to ensure that different directory
1593
- # names are used to avoid conflicts. Pig stores up to 100 tasks per streaming job.
1594
- #
1595
- # DEFINE Y `stream.pl` stderr('<dir>' limit 100);
1596
- # X = STREAM A THROUGH Y;
1597
- #
1598
- # In this example a function is defined for use with the FOREACH …GENERATE operator.
1599
- # grunt> REGISTER /src/myfunc.jar
1600
- # grunt> define myFunc myfunc.MyEvalfunc('foo');
1601
- # grunt> A = LOAD 'students';
1602
- # grunt> B = FOREACH A GENERATE myFunc($0);
1603
- #
1604
- # In this example a command is defined for use with the STREAM operator.
1605
- # grunt> A = LOAD 'data';
1606
- # grunt> DEFINE cmd `stream_cmd -input file.dat`;
1607
- # grunt> B = STREAM A THROUGH cmd;
1608
- #
1609
-
1610
-
1611
- # ---------------------------------------------------------------------------
1612
- #
1613
- # = REGISTER
1614
- #
1615
- # Registers a JAR file so that the UDFs in the file can be used.
1616
- #
1617
- # == Syntax
1618
- #
1619
- # REGISTER alias;
1620
- #
1621
- # == Terms
1622
- #
1623
- # alias::
- #    The path of a Java JAR file. Do not place the name in quotes.
1624
- #
1625
- # == Usage
1626
- #
1627
- # Use the REGISTER statement to specify the path of a Java JAR file containing UDFs.
1628
- #
1629
- # For more information about UDFs, see the User Defined Function Guide. Note that
1630
- # Pig currently only supports functions written in Java.
1631
- #
1632
- # == Example
1633
- #
1634
- # In this example REGISTER states that myfunc.jar is located in the /src
1635
- # directory.
1636
- #
1637
- # grunt> REGISTER /src/myfunc.jar;
1638
- # grunt> A = LOAD 'students';
1639
- # grunt> B = FOREACH A GENERATE myfunc.MyEvalFunc($0);
1640
- #