wukong 3.0.0.pre → 3.0.0.pre2

Files changed (476)
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,1509 +0,0 @@
1
- <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
- <html>
3
- <head>
4
- <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
5
- <meta content="Apache Forrest" name="Generator">
6
- <meta name="Forrest-version" content="0.8">
7
- <meta name="Forrest-skin-name" content="pelt">
8
- <title>Pig UDF Manual</title>
9
- <link type="text/css" href="skin/basic.css" rel="stylesheet">
10
- <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
11
- <link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
12
- <link type="text/css" href="skin/profile.css" rel="stylesheet">
13
- <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
14
- <link rel="shortcut icon" href="">
15
- </head>
16
- <body onload="init()">
17
- <script type="text/javascript">ndeSetTextSize();</script>
18
- <div id="top">
19
- <!--+
20
- |breadtrail
21
- +-->
22
- <div class="breadtrail">
23
- <a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/pig/">Pig</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
24
- </div>
25
- <!--+
26
- |header
27
- +-->
28
- <div class="header">
29
- <!--+
30
- |start group logo
31
- +-->
32
- <div class="grouplogo">
33
- <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
34
- </div>
35
- <!--+
36
- |end group logo
37
- +-->
38
- <!--+
39
- |start Project Logo
40
- +-->
41
- <div class="projectlogo">
42
- <a href="http://hadoop.apache.org/pig/"><img class="logoImage" alt="Pig" src="images/pig-logo.gif" title="A platform for analyzing large datasets."></a>
43
- </div>
44
- <!--+
45
- |end Project Logo
46
- +-->
47
- <!--+
48
- |start Search
49
- +-->
50
- <div class="searchbox">
51
- <form action="http://www.google.com/search" method="get" class="roundtopsmall">
52
- <input value="" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
53
- <input name="Search" value="Search" type="submit">
54
- </form>
55
- </div>
56
- <!--+
57
- |end search
58
- +-->
59
- <!--+
60
- |start Tabs
61
- +-->
62
- <ul id="tabs">
63
- <li>
64
- <a class="unselected" href="http://hadoop.apache.org/pig/">Project</a>
65
- </li>
66
- <li>
67
- <a class="unselected" href="http://wiki.apache.org/pig/">Wiki</a>
68
- </li>
69
- <li class="current">
70
- <a class="selected" href="index.html">Pig 0.7.0 Documentation</a>
71
- </li>
72
- </ul>
73
- <!--+
74
- |end Tabs
75
- +-->
76
- </div>
77
- </div>
78
- <div id="main">
79
- <div id="publishedStrip">
80
- <!--+
81
- |start Subtabs
82
- +-->
83
- <div id="level2tabs"></div>
84
- <!--+
85
- |end Endtabs
86
- +-->
87
- <script type="text/javascript"><!--
88
- document.write("Last Published: " + document.lastModified);
89
- // --></script>
90
- </div>
91
- <!--+
92
- |breadtrail
93
- +-->
94
- <div class="breadtrail">
95
-
96
- &nbsp;
97
- </div>
98
- <!--+
99
- |start Menu, mainarea
100
- +-->
101
- <!--+
102
- |start Menu
103
- +-->
104
- <div id="menu">
105
- <div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Pig</div>
106
- <div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
107
- <div class="menuitem">
108
- <a href="index.html">Overview</a>
109
- </div>
110
- <div class="menuitem">
111
- <a href="setup.html">Setup</a>
112
- </div>
113
- <div class="menuitem">
114
- <a href="tutorial.html">Tutorial</a>
115
- </div>
116
- <div class="menuitem">
117
- <a href="piglatin_ref1.html">Pig Latin 1</a>
118
- </div>
119
- <div class="menuitem">
120
- <a href="piglatin_ref2.html">Pig Latin 2</a>
121
- </div>
122
- <div class="menuitem">
123
- <a href="cookbook.html">Cookbook</a>
124
- </div>
125
- <div class="menupage">
126
- <div class="menupagetitle">UDFs</div>
127
- </div>
128
- </div>
129
- <div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Zebra</div>
130
- <div id="menu_1.2" class="menuitemgroup">
131
- <div class="menuitem">
132
- <a href="zebra_overview.html">Zebra Overview </a>
133
- </div>
134
- <div class="menuitem">
135
- <a href="zebra_users.html">Zebra Users </a>
136
- </div>
137
- <div class="menuitem">
138
- <a href="zebra_reference.html">Zebra Reference </a>
139
- </div>
140
- <div class="menuitem">
141
- <a href="zebra_mapreduce.html">Zebra MapReduce </a>
142
- </div>
143
- <div class="menuitem">
144
- <a href="zebra_pig.html">Zebra Pig </a>
145
- </div>
146
- <div class="menuitem">
147
- <a href="zebra_stream.html">Zebra Streaming </a>
148
- </div>
149
- </div>
150
- <div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Miscellaneous</div>
151
- <div id="menu_1.3" class="menuitemgroup">
152
- <div class="menuitem">
153
- <a href="api/">API Docs</a>
154
- </div>
155
- <div class="menuitem">
156
- <a href="http://wiki.apache.org/pig/">Wiki</a>
157
- </div>
158
- <div class="menuitem">
159
- <a href="http://wiki.apache.org/pig/FAQ">FAQ</a>
160
- </div>
161
- <div class="menuitem">
162
- <a href="http://hadoop.apache.org/pig/releases.html">Release Notes</a>
163
- </div>
164
- </div>
165
- <div id="credit"></div>
166
- <div id="roundbottom">
167
- <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
168
- <!--+
169
- |alternative credits
170
- +-->
171
- <div id="credit2"></div>
172
- </div>
173
- <!--+
174
- |end Menu
175
- +-->
176
- <!--+
177
- |start content
178
- +-->
179
- <div id="content">
180
- <div title="Portable Document Format" class="pdflink">
181
- <a class="dida" href="udf.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
182
- PDF</a>
183
- </div>
184
- <h1>Pig UDF Manual</h1>
185
- <div id="minitoc-area">
186
- <ul class="minitoc">
187
- <li>
188
- <a href="#Overview">Overview</a>
189
- </li>
190
- <li>
191
- <a href="#Eval+Functions">Eval Functions</a>
192
- <ul class="minitoc">
193
- <li>
194
- <a href="#How+to+Use+a+Simple+Eval+Function">How to Use a Simple Eval Function</a>
195
- </li>
196
- <li>
197
- <a href="#How+to+Write+a+Simple+Eval+Function"> How to Write a Simple Eval Function</a>
198
- </li>
199
- <li>
200
- <a href="#Aggregate+Functions">Aggregate Functions</a>
201
- </li>
202
- <li>
203
- <a href="#Filter+Functions"> Filter Functions</a>
204
- </li>
205
- <li>
206
- <a href="#Pig+Types"> Pig Types</a>
207
- </li>
208
- <li>
209
- <a href="#Schema"> Schema</a>
210
- </li>
211
- <li>
212
- <a href="#Error+Handling"> Error Handling</a>
213
- </li>
214
- <li>
215
- <a href="#Function+Overloading">Function Overloading</a>
216
- </li>
217
- <li>
218
- <a href="#Reporting+Progress">Reporting Progress</a>
219
- </li>
220
- <li>
221
- <a href="#Import+Lists">Import Lists</a>
222
- </li>
223
- </ul>
224
- </li>
225
- <li>
226
- <a href="#Load%2FStore+Functions"> Load/Store Functions</a>
227
- <ul class="minitoc">
228
- <li>
229
- <a href="#Load+Functions"> Load Functions</a>
230
- </li>
231
- <li>
232
- <a href="#Store+Functions"> Store Functions</a>
233
- </li>
234
- </ul>
235
- </li>
236
- <li>
237
- <a href="#Builtin+Functions+and+Function+Repositories">Builtin Functions and Function Repositories</a>
238
- </li>
239
- <li>
240
- <a href="#Accumulator+Interface">Accumulator Interface</a>
241
- </li>
242
- <li>
243
- <a href="#Advanced+Topics">Advanced Topics</a>
244
- <ul class="minitoc">
245
- <li>
246
- <a href="#Function+Instantiation">Function Instantiation</a>
247
- </li>
248
- <li>
249
- <a href="#Schemas">Schemas</a>
250
- </li>
251
- <li>
252
- <a href="#Passing+Configurations+to+UDFs">Passing Configurations to UDFs</a>
253
- </li>
254
- </ul>
255
- </li>
256
- </ul>
257
- </div>
258
-
259
-
260
-
261
- <a name="N1000D"></a><a name="Overview"></a>
262
- <h2 class="h3">Overview</h2>
263
- <div class="section">
264
- <p>Pig provides extensive support for user-defined functions (UDFs) as a way to specify custom processing.
265
- Functions can be a part of almost every operator in Pig.
266
- This document describes how to use existing functions as well as how to write your own functions.</p>
267
- </div>
268
-
269
-
270
- <a name="N10017"></a><a name="Eval+Functions"></a>
271
- <h2 class="h3">Eval Functions</h2>
272
- <div class="section">
273
- <a name="N1001D"></a><a name="How+to+Use+a+Simple+Eval+Function"></a>
274
- <h3 class="h4">How to Use a Simple Eval Function</h3>
275
- <p>Eval is the most common type of function. It can be used in <span class="codefrag">FOREACH</span> statements as shown in this script: </p>
276
- <pre class="code">
277
- -- myscript.pig
278
- REGISTER myudfs.jar;
279
- A = LOAD 'student_data' AS (name: chararray, age: int, gpa: float);
280
- B = FOREACH A GENERATE myudfs.UPPER(name);
281
- DUMP B;
282
- </pre>
283
- <p>The command below can be used to run the script. Note that all examples in this document run in local mode for simplicity
284
- but the examples can also run in Hadoop mode. For more information on how to run Pig, please see the PigTutorial. </p>
285
- <pre class="code">
286
- java -cp pig.jar org.apache.pig.Main -x local myscript.pig
287
- </pre>
288
- <p>The first line of the script provides the location of the <span class="codefrag">jar&nbsp;file</span> that contains the UDF.
289
- (Note that there are no quotes around the jar file. Having quotes would result in a syntax error.)
290
- To locate the jar file, Pig first checks the <span class="codefrag">classpath</span>. If the jar file can't be found in the classpath,
291
- Pig assumes that the location is either an absolute path or a path relative to the location from which Pig was invoked.
292
- If the jar file can't be found, an error will be printed: <span class="codefrag">java.io.IOException:&nbsp;Can't&nbsp;read&nbsp;jar&nbsp;file:&nbsp;myudfs.jar</span>. </p>
293
- <p>Multiple <span class="codefrag">register</span> commands can be used in the same script. If the same fully-qualified function is present in multiple jars,
294
- the first occurrence will be used consistently with Java semantics. </p>
295
- <p>The name of the UDF has to be fully qualified with the package name or an error will be reported:
296
- <span class="codefrag">java.io.IOException:&nbsp;Cannot&nbsp;instantiate:UPPER</span>. Also, the function name is case sensitive (UPPER and upper are not the same).
297
- A UDF can take one or more parameters. The exact signature of the function should clear from its documentation. </p>
298
- <p>The function provided in this example takes an ASCII string and produces its uppercase version. If you are familiar with column transformation functions
299
- in SQL, you will recognize that UPPER fits this concept. However, as we will see later in the document, eval functions in Pig go beyond column
300
- transformation functions and include aggregate and filter functions. </p>
301
- <p>If you are just a user of UDFs, this is most of what you need to know about UDFs to use them in your code. </p>
302
- <a name="N10053"></a><a name="How+to+Write+a+Simple+Eval+Function"></a>
303
- <h3 class="h4"> How to Write a Simple Eval Function</h3>
304
- <p>Let's now look at the implementation of the <span class="codefrag">UPPER</span> UDF. </p>
305
- <pre class="code">
306
- package myudfs;
307
- import java.io.IOException;
308
- import org.apache.pig.EvalFunc;
309
- import org.apache.pig.data.Tuple;
310
- import org.apache.pig.impl.util.WrappedIOException;
311
-
312
- public class UPPER extends EvalFunc (String)
313
- {
314
- public String exec(Tuple input) throws IOException {
315
- if (input == null || input.size() == 0)
316
- return null;
317
- try{
318
- String str = (String)input.get(0);
319
- return str.toUpperCase();
320
- }catch(Exception e){
321
- throw WrappedIOException.wrap("Caught exception processing input row ", e);
322
- }
323
- }
324
- }
325
- </pre>
326
- <p>The first line indicates that the function is part of the <span class="codefrag">myudfs</span> package. The UDF class extends the <span class="codefrag">EvalFunc</span> class which is the base class for all eval functions. It is parameterized with the return type of the UDF which is a Java <span class="codefrag">String</span> in this case. We will look into the <span class="codefrag">EvalFunc</span> class in more detail later, but for now all we need to do is to implement the <span class="codefrag">exec</span> function. This function is invoked on every input tuple. The input into the function is a tuple with input parameters in the order they are passed to the function in the Pig script. In our example, it will contain a single string field corresponding to the student name. </p>
327
- <p>The first thing to decide is what to do with invalid data. This depends on the format of the data. If the data is of type <span class="codefrag">bytearray</span> it means that it has not yet been converted to its proper type. In this case, if the format of the data does not match the expected type, a NULL value should be returned. If, on the other hand, the input data is of another type, this means that the conversion has already happened and the data should be in the correct format. This is the case with our example and that's why it throws an error (line 16.) Note that <span class="codefrag">WrappedIOException</span> is a helper class to convert the actual exception to an IOException. </p>
328
- <p>Also, note that lines 10-11 check if the input data is null or empty and if so returns null. </p>
329
- <p>The actual function implementation is on lines 13-14 and is self-explanatory. </p>
330
- <p>Now that we have the function implemented, it needs to be compiled and included in a jar. You will need to build <span class="codefrag">pig.jar</span> to compile your UDF. You can use the following set of commands to checkout the code from SVN repository and create pig.jar: </p>
331
- <pre class="code">
332
- svn co http://svn.apache.org/repos/asf/hadoop/pig/trunk
333
- cd trunk
334
- ant
335
- </pre>
336
- <p>You should see <span class="codefrag">pig.jar</span> in your current working directory. The set of commands below first compiles the function and then creates a jar file that contains it. </p>
337
- <pre class="code">
338
- cd myudfs
339
- javac -cp pig.jar UPPER.java
340
- cd ..
341
- jar -cf myudfs.jar myudfs
342
- </pre>
343
- <p>You should now see <span class="codefrag">myudfs.jar</span> in your current working directory. You can use this jar with the script described in the previous section. </p>
344
- <a name="N1009F"></a><a name="Aggregate+Functions"></a>
345
- <h3 class="h4">Aggregate Functions</h3>
346
- <p>Aggregate functions are another common type of eval function. Aggregate functions are usually applied to grouped data, as shown in this script: </p>
347
- <pre class="code">
348
- -- myscript2.pig
349
- A = LOAD 'student_data' AS (name: chararray, age: int, gpa: float);
350
- B = GROUP A BY name;
351
- C = FOREACH B GENERATE group, COUNT(A);
352
- DUMP C;
353
- </pre>
354
- <p>The script above uses the <span class="codefrag">COUNT</span> function to count the number of students with the same name. There are a couple of things to note about this script. First, even though we are using a function, there is no <span class="codefrag">register</span> command. Second, the function is not qualified with the package name. The reason for both is that <span class="codefrag">COUNT</span> is a <span class="codefrag">builtin</span> function meaning that it comes with the Pig distribution. These are the only two differences between builtins and UDFs. Builtins are discussed in more detail later in this document. </p>
355
- <p>An aggregate function is an eval function that takes a bag and returns a scalar value. One interesting and useful property of many aggregate functions is that they can be computed incrementally in a distributed fashion. We call these functions <span class="codefrag">algebraic</span>. <span class="codefrag">COUNT</span> is an example of an algebraic function because we can count the number of elements in a subset of the data and then sum the counts to produce a final output. In the Hadoop world, this means that the partial computations can be done by the map and combiner, and the final result can be computed by the reducer. </p>
356
- <p>It is very important for performance to make sure that aggregate functions that are algebraic are implemented as such. Let's look at the implementation of the COUNT function to see what this means. (Error handling and some other code is omitted to save space. The full code can be accessed <a href="http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/COUNT.java?view=markup"> here</a>.</p>
357
- <pre class="code">
358
- public class COUNT extends EvalFunc (Long) implements Algebraic{
359
- public Long exec(Tuple input) throws IOException {return count(input);}
360
- public String getInitial() {return Initial.class.getName();}
361
- public String getIntermed() {return Intermed.class.getName();}
362
- public String getFinal() {return Final.class.getName();}
363
- static public class Initial extends EvalFunc (Tuple) {
364
- public Tuple exec(Tuple input) throws IOException {return TupleFactory.getInstance().newTuple(count(input));}
365
- }
366
- static public class Intermed extends EvalFunc (Tuple) {
367
- public Tuple exec(Tuple input) throws IOException {return TupleFactory.getInstance().newTuple(sum(input));}
368
- }
369
- static public class Final extends EvalFunc (Long) {
370
- public Tuple exec(Tuple input) throws IOException {return sum(input);}
371
- }
372
- static protected Long count(Tuple input) throws ExecException {
373
- Object values = input.get(0);
374
- if (values instanceof DataBag) return ((DataBag)values).size();
375
- else if (values instanceof Map) return new Long(((Map)values).size());
376
- }
377
- static protected Long sum(Tuple input) throws ExecException, NumberFormatException {
378
- DataBag values = (DataBag)input.get(0);
379
- long sum = 0;
380
- for (Iterator (Tuple) it = values.iterator(); it.hasNext();) {
381
- Tuple t = it.next();
382
- sum += (Long)t.get(0);
383
- }
384
- return sum;
385
- }
386
- }
387
- </pre>
388
- <p>
389
- <span class="codefrag">COUNT</span> implements <span class="codefrag">Algebraic</span> interface which looks like this: </p>
390
- <pre class="code">
391
- public interface Algebraic{
392
- public String getInitial();
393
- public String getIntermed();
394
- public String getFinal();
395
- }
396
- </pre>
397
- <p>For a function to be algebraic, it needs to implement <span class="codefrag">Algebraic</span> interface that consist of definition of three classes derived from <span class="codefrag">EvalFunc</span>. The contract is that the <span class="codefrag">exec</span> function of the <span class="codefrag">Initial</span> class is called once and is passed the original input tuple. Its output is a tuple that contains partial results. The <span class="codefrag">exec</span> function of the <span class="codefrag">Intermed</span> class can be called zero or more times and takes as its input a tuple that contains partial results produced by the <span class="codefrag">Initial</span> class or by prior invocations of the <span class="codefrag">Intermed</span> class and produces a tuple with another partial result. Finally, the <span class="codefrag">exec</span> function of the <span class="codefrag">Final</span> class is called and produces the final result as a scalar type. </p>
398
- <p>Here's the way to think about this in the Hadoop world. The <span class="codefrag">exec</span> function of the <span class="codefrag">Initial</span> class is invoked once by the <span class="codefrag">map</span> process and produces partial results. The <span class="codefrag">exec</span> function of the <span class="codefrag">Intermed</span> class is invoked once by each <span class="codefrag">combiner</span> invocation (which can happen zero or more times) and also produces partial results. The <span class="codefrag">exec</span> function of the <span class="codefrag">Final</span> class is invoked once by the reducer and produces the final result. </p>
399
- <p>Take a look at the <span class="codefrag">COUNT</span> implementation to see how this is done. Note that the <span class="codefrag">exec</span> function of the <span class="codefrag">Initial</span> and <span class="codefrag">Intermed</span> classes is parameterized with <span class="codefrag">Tuple</span> and the <span class="codefrag">exec</span> of the <span class="codefrag">Final</span> class is parameterized with the real type of the function, which in the case of the <span class="codefrag">COUNT</span> is <span class="codefrag">Long</span>. Also, note that the fully-qualified name of the class needs to be returned from <span class="codefrag">getInitial</span>, <span class="codefrag">getIntermed</span>, and <span class="codefrag">getFinal</span> methods. </p>
400
- <a name="N1013F"></a><a name="Filter+Functions"></a>
401
- <h3 class="h4"> Filter Functions</h3>
402
- <p>Filter functions are eval functions that return a <span class="codefrag">boolean</span> value. Filter functions can be used anywhere a Boolean expression is appropriate, including the <span class="codefrag">FILTER</span> operator or <span class="codefrag">bincond</span> expression. </p>
403
- <p>The example below uses the <span class="codefrag">IsEmpy</span> builtin filter function to implement joins. </p>
404
- <pre class="code">
405
- -- inner join
406
- A = LOAD 'student_data' AS (name: chararray, age: int, gpa: float);
407
- B = LOAD 'voter_data' AS (name: chararray, age: int, registration: chararay, contributions: float);
408
- C = COGROUP A BY name, B BY name;
409
- D = FILTER C BY not IsEmpty(A);
410
- E = FILTER D BY not IsEmpty(B);
411
- F = FOREACH E GENERATE flatten(A), flatten(B);
412
- DUMP F;
413
- </pre>
414
- <p>Note that, even if filtering is omitted, the same results will be produced because the <span class="codefrag">foreach</span> results is a cross product and cross products get rid of empty bags. However, doing up-front filtering is more efficient since it reduces the input of the cross product. </p>
415
- <pre class="code">
416
- -- full outer join
417
- A = LOAD 'student_data' AS (name: chararray, age: int, gpa: float);
418
- B = LOAD 'voter_data' AS (name: chararray, age: int, registration: chararay, contributions: float);
419
- C = COGROUP A BY name, B BY name;
420
- D = FOREACH C GENERATE group, flatten((IsEmpty(A) ? null : A)), flatten((IsEmpty(B) ? null : B));
421
- dump D
422
- </pre>
423
- <p>The implementation of the <span class="codefrag">IsEmpty</span> function looks like this: </p>
424
- <pre class="code">
425
- import java.io.IOException;
426
- import java.util.Map;
427
- import org.apache.pig.FilterFunc;
428
- import org.apache.pig.backend.executionengine.ExecException;
429
- import org.apache.pig.data.DataBag;
430
- import org.apache.pig.data.Tuple;
431
- import org.apache.pig.data.DataType;
432
- import org.apache.pig.impl.util.WrappedIOException;
433
-
434
- public class IsEmpty extends FilterFunc {
435
- public Boolean exec(Tuple input) throws IOException {
436
- if (input == null || input.size() == 0)
437
- return null;
438
- try {
439
- Object values = input.get(0);
440
- if (values instanceof DataBag)
441
- return ((DataBag)values).size() == 0;
442
- else if (values instanceof Map)
443
- return ((Map)values).size() == 0;
444
- else{
445
- throw new IOException("Cannot test a " +
446
- DataType.findTypeName(values) + " for emptiness.");
447
- }
448
- } catch (ExecException ee) {
449
- throw WrappedIOException.wrap("Caught exception processing input row ", ee);
450
- }
451
- }
452
- }
453
-
454
- </pre>
455
- <a name="N10170"></a><a name="Pig+Types"></a>
456
- <h3 class="h4"> Pig Types</h3>
457
- <p>The main thing to know about Pig's type system is that Pig uses native Java types for almost all of its types, as shown in this table. </p>
458
- <table class="ForrestTable" cellspacing="1" cellpadding="4">
459
-
460
- <tr>
461
-
462
- <th colspan="1" rowspan="1">
463
- Pig Type
464
- </th>
465
- <th colspan="1" rowspan="1">
466
- Java Class
467
- </th>
468
-
469
- </tr>
470
-
471
- <tr>
472
-
473
- <td colspan="1" rowspan="1">
474
-
475
- <p> bytearray </p>
476
-
477
- </td>
478
- <td colspan="1" rowspan="1">
479
-
480
- <p> DataByteArray </p>
481
-
482
- </td>
483
-
484
- </tr>
485
-
486
- <tr>
487
-
488
- <td colspan="1" rowspan="1">
489
-
490
- <p> chararray </p>
491
-
492
- </td>
493
- <td colspan="1" rowspan="1">
494
-
495
- <p> String </p>
496
-
497
- </td>
498
-
499
- </tr>
500
-
501
- <tr>
502
-
503
- <td colspan="1" rowspan="1">
504
-
505
- <p> int </p>
506
-
507
- </td>
508
- <td colspan="1" rowspan="1">
509
-
510
- <p> Integer </p>
511
-
512
- </td>
513
-
514
- </tr>
515
-
516
- <tr>
517
-
518
- <td colspan="1" rowspan="1">
519
-
520
- <p> long </p>
521
-
522
- </td>
523
- <td colspan="1" rowspan="1">
524
-
525
- <p> Long </p>
526
-
527
- </td>
528
-
529
- </tr>
530
-
531
- <tr>
532
-
533
- <td colspan="1" rowspan="1">
534
-
535
- <p> float </p>
536
-
537
- </td>
538
- <td colspan="1" rowspan="1">
539
-
540
- <p> Float </p>
541
-
542
- </td>
543
-
544
- </tr>
545
-
546
- <tr>
547
-
548
- <td colspan="1" rowspan="1">
549
-
550
- <p> double </p>
551
-
552
- </td>
553
- <td colspan="1" rowspan="1">
554
-
555
- <p> Double </p>
556
-
557
- </td>
558
-
559
- </tr>
560
-
561
- <tr>
562
-
563
- <td colspan="1" rowspan="1">
564
-
565
- <p> tuple </p>
566
-
567
- </td>
568
- <td colspan="1" rowspan="1">
569
-
570
- <p> Tuple </p>
571
-
572
- </td>
573
-
574
- </tr>
575
-
576
- <tr>
577
-
578
- <td colspan="1" rowspan="1">
579
-
580
- <p> bag </p>
581
-
582
- </td>
583
- <td colspan="1" rowspan="1">
584
-
585
- <p> DataBag </p>
586
-
587
- </td>
588
-
589
- </tr>
590
-
591
- <tr>
592
-
593
- <td colspan="1" rowspan="1">
594
-
595
- <p> map </p>
596
-
597
- </td>
598
- <td colspan="1" rowspan="1">
599
-
600
- <p> Map&lt;Object, Object&gt; </p>
601
-
602
- </td>
603
-
604
- </tr>
605
-
606
- </table>
607
- <p>All Pig-specific classes are available <a href="http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/data/"> here</a>. </p>
608
- <p>
609
- <span class="codefrag">Tuple</span> and <span class="codefrag">DataBag</span> are different in that they are not concrete classes but rather interfaces. This enables users to extend Pig with their own versions of tuples and bags. As a result, UDFs cannot directly instantiate bags or tuples; they need to go through factory classes: <span class="codefrag">TupleFactory</span> and <span class="codefrag">BagFactory</span>. </p>
610
- <p>The builtin <span class="codefrag">TOKENIZE</span> function shows how bags and tuples are created. A function takes a text string as input and returns a bag of words from the text. (Note that currently Pig bags always contain tuples.) </p>
611
- <pre class="code">
612
- package org.apache.pig.builtin;
613
-
614
- import java.io.IOException;
615
- import java.util.StringTokenizer;
616
- import org.apache.pig.EvalFunc;
617
- import org.apache.pig.data.BagFactory;
618
- import org.apache.pig.data.DataBag;
619
- import org.apache.pig.data.Tuple;
620
- import org.apache.pig.data.TupleFactory;
621
-
622
- public class TOKENIZE extends EvalFunc (DataBag) {
623
- TupleFactory mTupleFactory = TupleFactory.getInstance();
624
- BagFactory mBagFactory = BagFactory.getInstance();
625
-
626
- public DataBag exec(Tuple input) throws IOException
627
- try {
628
- DataBag output = mBagFactory.newDefaultBag();
629
- Object o = input.get(0);
630
- if (!(o instanceof String)) {
631
- throw new IOException("Expected input to be chararray, but got " + o.getClass().getName());
632
- }
633
- StringTokenizer tok = new StringTokenizer((String)o, " \",()*", false);
634
- while (tok.hasMoreTokens()) output.add(mTupleFactory.newTuple(tok.nextToken()));
635
- return output;
636
- } catch (ExecException ee) {
637
- // error handling goes here
638
- }
639
- }
640
- }
641
- </pre>
642
- <a name="N10254"></a><a name="Schema"></a>
643
- <h3 class="h4"> Schema</h3>
644
- <p>The latest version of Pig uses type information for validation and performance. It is important for UDFs to participate in type propagation. Until now, our UDFs made no effort to communicate their output schema to Pig. This is because, most of the time, Pig can figure out this information by using Java's <a href="http://java.sun.com/developer/technicalArticles/ALT/Reflection/"> Reflection</a>. If your UDF returns a scalar or a map, no work is required. However, if your UDF returns a <span class="codefrag">tuple</span> or a <span class="codefrag">bag</span> (of tuples), it needs to help Pig figure out the structure of the tuple. </p>
645
- <p>If a UDF returns a <span class="codefrag">tuple</span> or a <span class="codefrag">bag</span> and schema information is not provided, Pig assumes that the tuple contains a single field of type <span class="codefrag">bytearray</span>. If this is not the case, then not specifying the schema can cause failures. We look at this next. </p>
646
- <p>Let's assume that we have UDF <span class="codefrag">Swap</span> that, given a tuple with two fields, swaps their order. Let's assume that the UDF does not specify a schema and look at the scripts below: </p>
647
- <pre class="code">
648
- register myudfs.jar;
649
- A = load 'student_data' as (name: chararray, age: int, gpa: float);
650
- B = foreach A generate flatten(myudfs.Swap(name, age)), gpa;
651
- C = foreach B generate $2;
652
- D = limit B 20;
653
- dump D;
654
- </pre>
655
- <p>This script will result in the following error cause by line 4. </p>
656
- <pre class="code">
657
- java.io.IOException: Out of bound access. Trying to access non-existent column: 2. Schema {bytearray,gpa: float} has 2 column(s).
658
- </pre>
659
- <p>This is because Pig is only aware of two columns in B while line 4 is requesting the third column of the tuple. (Column indexing in Pig starts with 0.) </p>
660
- <p>The function, including the schema, looks like this: </p>
661
- <pre class="code">
662
- package myudfs;
663
- import java.io.IOException;
664
- import org.apache.pig.EvalFunc;
665
- import org.apache.pig.data.Tuple;
666
- import org.apache.pig.data.TupleFactory;
667
- import org.apache.pig.impl.logicalLayer.schema.Schema;
668
- import org.apache.pig.data.DataType;
669
-
670
- public class Swap extends EvalFunc (Tuple) {
671
- public Tuple exec(Tuple input) throws IOException {
672
- if (input == null || input.size() 2
673
- return null;
674
- try{
675
- Tuple output = TupleFactory.getInstance().newTuple(2);
676
- output.set(0, input.get(1));
677
- output.set(1, input.get(0));
678
- return output;
679
- } catch(Exception e){
680
- System.err.println("Failed to process input; error - " + e.getMessage());
681
- return null;
682
- }
683
- }
684
- public Schema outputSchema(Schema input) {
685
- try{
686
- Schema tupleSchema = new Schema();
687
- tupleSchema.add(input.getField(1));
688
- tupleSchema.add(input.getField(0));
689
- return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),tupleSchema, DataType.TUPLE));
690
- }catch (Exception e){
691
- return null;
692
- }
693
- }
694
- }
695
- </pre>
696
- <p>The function creates a schema with a single field (of type <span class="codefrag">FieldSchema</span>) of type <span class="codefrag">tuple</span>. The name of the field is constructed using the <span class="codefrag">getSchemaName</span> function of the <span class="codefrag">EvalFunc</span> class. The name consists of the name of the UDF function, the first parameter passed to it, and a sequence number to guarantee uniqueness. In the previous script, if you replace <span class="codefrag">dump&nbsp;D;</span> with <span class="codefrag">describe&nbsp;B;</span>, you will see the following output:
697
- </p>
698
- <pre class="code">
699
- B: {myudfs.swap_age_3::age: int,myudfs.swap_age_3::name: chararray,gpa: float}
700
- </pre>
701
- <p>The second parameter to the <span class="codefrag">FieldSchema</span> constructor is the schema representing this field, which in this case is a tuple with two fields. The third parameter represents the type of the schema, which in this case is a <span class="codefrag">TUPLE</span>. All supported schema types are defined in the <span class="codefrag">org.apache.pig.data.DataType</span> class. </p>
702
- <pre class="code">
703
- public class DataType {
704
- public static final byte UNKNOWN = 0;
705
- public static final byte NULL = 1;
706
- public static final byte BOOLEAN = 5; // internal use only
707
- public static final byte BYTE = 6; // internal use only
708
- public static final byte INTEGER = 10;
709
- public static final byte LONG = 15;
710
- public static final byte FLOAT = 20;
711
- public static final byte DOUBLE = 25;
712
- public static final byte BYTEARRAY = 50;
713
- public static final byte CHARARRAY = 55;
714
- public static final byte MAP = 100;
715
- public static final byte TUPLE = 110;
716
- public static final byte BAG = 120;
717
- public static final byte ERROR = -1;
718
- // more code here
719
- }
720
- </pre>
721
- <p>You need to import the <span class="codefrag">org.apache.pig.data.DataType</span> class into your code to define schemas. You also need to import the schema class <span class="codefrag">org.apache.pig.impl.logicalLayer.schema.Schema</span>. </p>
722
- <p>The example above shows how to create an output schema for a tuple. Doing this for a bag is very similar. Let's extend the <span class="codefrag">TOKENIZE</span> function to do that: </p>
723
- <pre class="code">
724
- package org.apache.pig.builtin;
725
-
726
- import java.io.IOException;
727
- import java.util.StringTokenizer;
728
- import org.apache.pig.EvalFunc;
729
- import org.apache.pig.data.BagFactory;
730
- import org.apache.pig.data.DataBag;
731
- import org.apache.pig.data.Tuple;
732
- import org.apache.pig.data.TupleFactory;
733
- import org.apache.pig.impl.logicalLayer.schema.Schema;
734
- import org.apache.pig.data.DataType;
735
-
736
- public class TOKENIZE extends EvalFunc&lt;DataBag&gt; {
737
- TupleFactory mTupleFactory = TupleFactory.getInstance();
738
- BagFactory mBagFactory = BagFactory.getInstance();
739
- public DataBag exec(Tuple input) throws IOException {
740
- try {
741
- DataBag output = mBagFactory.newDefaultBag();
742
- Object o = input.get(0);
743
- if (!(o instanceof String)) {
744
- throw new IOException("Expected input to be chararray, but got " + o.getClass().getName());
745
- }
746
- StringTokenizer tok = new StringTokenizer((String)o, " \",()*", false);
747
- while (tok.hasMoreTokens()) output.add(mTupleFactory.newTuple(tok.nextToken()));
748
- return output;
749
- } catch (ExecException ee) {
750
- // error handling goes here
751
- }
752
- }
753
- public Schema outputSchema(Schema input) {
754
- try{
755
- Schema bagSchema = new Schema();
756
- bagSchema.add(new Schema.FieldSchema("token", DataType.CHARARRAY));
757
-
758
- return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
759
- bagSchema, DataType.BAG));
760
- }catch (Exception e){
761
- return null;
762
- }
763
- }
764
- }
765
- </pre>
766
- <p>As you can see, this is very similar to the output schema definition in the <span class="codefrag">Swap</span> function. One difference is that instead of reusing the input schema, we create a brand new field schema to represent the tokens stored in the bag. The other difference is that the type of the schema created is <span class="codefrag">BAG</span> (not <span class="codefrag">TUPLE</span>). </p>
767
- <a name="N102D1"></a><a name="Error+Handling"></a>
768
- <h3 class="h4"> Error Handling</h3>
769
- <p>There are several types of errors that can occur in a UDF: </p>
770
- <ol>
771
-
772
- <li>
773
- <p>An error that affects a particular row but is not likely to impact other rows. An example of such an error would be a malformed input value or a divide-by-zero problem. A reasonable way to handle this situation is to emit a warning and return a null value. The <span class="codefrag">ABS</span> function in the next section demonstrates this approach. The current approach is to write the warning to <span class="codefrag">stderr</span>. Eventually we would like to pass a logger to the UDFs. Note that returning a NULL value only makes sense if the malformed value is of type <span class="codefrag">bytearray</span>. Otherwise the proper type has already been created and should have an appropriate value. If this is not the case, it is an internal error and should cause the system to fail. Both cases can be seen in the implementation of the <span class="codefrag">ABS</span> function in the next section. </p>
774
-
775
- </li>
776
-
777
- <li>
778
- <p>An error that affects the entire processing but can succeed on retry. An example of such a failure is the inability to open a lookup file because the file could not be found. This could be a temporary environmental issue that can go away on retry. A UDF can signal this to Pig by throwing an <span class="codefrag">IOException</span> as with the case of the <span class="codefrag">ABS</span> function below. </p>
779
-
780
- </li>
781
-
782
- <li>
783
- <p>An error that affects the entire processing and is not likely to succeed on retry. An example of such a failure is the inability to open a lookup file because of file permission problems. Neither Pig nor Hadoop currently has a way to handle this case, so it is handled the same way as case 2 above. </p>
784
-
785
- </li>
786
-
787
- </ol>
788
- <p>Pig provides a helper class <span class="codefrag">WrappedIOException</span>. The intent here is to allow you to convert any exception into <span class="codefrag">IOException</span>. Its usage can be seen in the <span class="codefrag">UPPER</span> function in our first example. </p>
789
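- <p>As a quick illustration (a minimal sketch; <span class="codefrag">riskyOperation</span> is a hypothetical helper, not part of Pig), the typical wrapping pattern inside <span class="codefrag">exec</span> looks like this: </p>
- <pre class="code">
- import java.io.IOException;
- import org.apache.pig.impl.util.WrappedIOException;
-
- // Convert any checked exception into an IOException so that Pig's
- // error-handling machinery can deal with it.
- try {
-     riskyOperation();   // hypothetical helper that may throw a checked exception
- } catch (Exception e) {
-     throw WrappedIOException.wrap("Caught exception processing input row ", e);
- }
- </pre>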
- <a name="N1030B"></a><a name="Function+Overloading"></a>
790
- <h3 class="h4">Function Overloading</h3>
791
- <p>Before the type system was available in Pig, all values for the purpose of arithmetic calculations were assumed to be doubles as the safest choice. However, this is not very efficient if the data is actually of type integer or long. (We saw about a 2x slowdown of a query when using double where integer could be used.) Now that Pig supports types we can take advantage of the type information and choose the function that is most efficient for the provided operands. </p>
792
- <p>UDF writers are encouraged to provide type-specific versions of a function if this can result in better performance. On the other hand, we don't want the users of the functions to worry about different functions - the right thing should just happen. Pig allows for this via a function table mechanism as shown in the next example. </p>
793
- <p>This example shows the implementation of the <span class="codefrag">ABS</span> function that returns the absolute value of a numeric value passed to it as input. </p>
794
- <pre class="code">
795
- import java.io.IOException;
796
- import java.util.List;
797
- import java.util.ArrayList;
798
- import org.apache.pig.EvalFunc;
799
- import org.apache.pig.FuncSpec;
800
- import org.apache.pig.data.Tuple;
801
- import org.apache.pig.impl.logicalLayer.FrontendException;
802
- import org.apache.pig.impl.util.WrappedIOException;
803
- import org.apache.pig.impl.logicalLayer.schema.Schema;
804
- import org.apache.pig.data.DataType;
805
-
806
- public class ABS extends EvalFunc&lt;Double&gt; {
807
- public Double exec(Tuple input) throws IOException {
808
- if (input == null || input.size() == 0)
809
- return null;
810
- Double d;
811
- try{
812
- d = DataType.toDouble(input.get(0));
813
- } catch (NumberFormatException nfe){
814
- System.err.println("Failed to process input; error - " + nfe.getMessage());
815
- return null;
816
- } catch (Exception e){
817
- throw WrappedIOException.wrap("Caught exception processing input row ", e);
818
- }
819
- return Math.abs(d);
820
- }
821
- public List&lt;FuncSpec&gt; getArgToFuncMapping() throws FrontendException {
822
- List&lt;FuncSpec&gt; funcList = new ArrayList&lt;FuncSpec&gt;();
823
- funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.BYTEARRAY))));
824
- funcList.add(new FuncSpec(DoubleAbs.class.getName(), new Schema(new Schema.FieldSchema(null, DataType.DOUBLE))));
825
- funcList.add(new FuncSpec(FloatAbs.class.getName(), new Schema(new Schema.FieldSchema(null, DataType.FLOAT))));
826
- funcList.add(new FuncSpec(IntAbs.class.getName(), new Schema(new Schema.FieldSchema(null, DataType.INTEGER))));
827
- funcList.add(new FuncSpec(LongAbs.class.getName(), new Schema(new Schema.FieldSchema(null, DataType.LONG))));
828
- return funcList;
829
- }
830
- }
831
- </pre>
832
- <p>The main thing to notice in this example is the <span class="codefrag">getArgToFuncMapping()</span> method. This method returns a list that contains a mapping from the input schema to the class that should be used to handle it. In this example the main class handles the <span class="codefrag">bytearray</span> input and outsources the rest of the work to other classes implemented in separate files in the same package. The example of one such class is below. This class handles integer input values. </p>
833
- <pre class="code">
834
- import java.io.IOException;
835
- import org.apache.pig.impl.util.WrappedIOException;
836
- import org.apache.pig.EvalFunc;
837
- import org.apache.pig.data.Tuple;
838
-
839
- public class IntAbs extends EvalFunc&lt;Integer&gt; {
840
- public Integer exec(Tuple input) throws IOException {
841
- if (input == null || input.size() == 0)
842
- return null;
843
- Integer d;
844
- try{
845
- d = (Integer)input.get(0);
846
- } catch (Exception e){
847
- throw WrappedIOException.wrap("Caught exception processing input row ", e);
848
- }
849
- return Math.abs(d);
850
- }
851
- }
852
- </pre>
853
- <p>A note on error handling. The <span class="codefrag">ABS</span> class covers the case of the <span class="codefrag">bytearray</span> which means the data has not been converted yet to its actual type. This is why a null value is returned when <span class="codefrag">NumberFormatException</span> is encountered. However, the <span class="codefrag">IntAbs</span> function is only called if the data is already of type <span class="codefrag">Integer</span> which means it has already been converted to the real type and bad format has been dealt with. This is why an exception is thrown if the input can't be cast to <span class="codefrag">Integer</span>. </p>
854
- <p>The example above covers a reasonably simple case where the UDF only takes one parameter and there is a separate function for each parameter type. However, this will not always be the case. If Pig can't find an <span class="codefrag">exact&nbsp;match</span> it tries to do a <span class="codefrag">best&nbsp;match</span>. The rule for the best match is to find the most efficient function that can be used safely. This means that Pig must find the function that, for each input parameter, provides the smallest type that is equal to or greater than the input type. The type progression rules are: <span class="codefrag">int&nbsp;-&gt;&nbsp;long&nbsp;-&gt;&nbsp;float&nbsp;-&gt;&nbsp;double</span>. </p>
855
- <p>For instance, let's consider function <span class="codefrag">MAX</span> which is part of the <span class="codefrag">piggybank</span> described later in this document. Given two values, the function returns the larger value. The function table for <span class="codefrag">MAX</span> looks like this: </p>
856
- <pre class="code">
857
- public List&lt;FuncSpec&gt; getArgToFuncMapping() throws FrontendException {
858
- List&lt;FuncSpec&gt; funcList = new ArrayList&lt;FuncSpec&gt;();
859
- Util.addToFunctionList(funcList, IntMax.class.getName(), DataType.INTEGER);
860
- Util.addToFunctionList(funcList, DoubleMax.class.getName(), DataType.DOUBLE);
861
- Util.addToFunctionList(funcList, FloatMax.class.getName(), DataType.FLOAT);
862
- Util.addToFunctionList(funcList, LongMax.class.getName(), DataType.LONG);
863
-
864
- return funcList;
865
- }
866
- </pre>
867
- <p>The <span class="codefrag">Util.addToFunctionList</span> function is a helper that adds an entry to the list passed as the first argument, keyed by the class name passed as the second argument, with a schema containing two fields of the type passed as the third argument. </p>
868
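- <p>For clarity, here is roughly what a single call above expands to, following the description of the helper and the <span class="codefrag">FuncSpec</span> pattern from the <span class="codefrag">ABS</span> example (a sketch, not the actual piggybank source): </p>
- <pre class="code">
- // Roughly equivalent to Util.addToFunctionList(funcList, IntMax.class.getName(), DataType.INTEGER):
- // a FuncSpec keyed by the class name, with a schema of two fields of that type.
- Schema s = new Schema();
- s.add(new Schema.FieldSchema(null, DataType.INTEGER));
- s.add(new Schema.FieldSchema(null, DataType.INTEGER));
- funcList.add(new FuncSpec(IntMax.class.getName(), s));
- </pre>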
- <p>Let's now see how the MAX function can be used in a Pig script: </p>
869
- <pre class="code">
870
- REGISTER piggybank.jar
871
- A = LOAD 'student_data' AS (name: chararray, gpa1: float, gpa2: double);
872
- B = FOREACH A GENERATE name, org.apache.pig.piggybank.evaluation.math.MAX(gpa1, gpa2);
873
- DUMP B;
874
- </pre>
875
- <p>In this example, the function gets one parameter of type <span class="codefrag">float</span> and another of type <span class="codefrag">double</span>. The best fit will be the function that takes two double values. Pig makes this choice on the user's behalf by inserting implicit casts for the parameters. Running the script above is equivalent to running the script below: </p>
876
- <pre class="code">
877
- A = LOAD 'student_data' AS (name: chararray, gpa1: float, gpa2: double);
878
- B = FOREACH A GENERATE name, org.apache.pig.piggybank.evaluation.math.MAX((double)gpa1, gpa2);
879
- DUMP B;
880
- </pre>
881
- <p>A special case of the <span class="codefrag">best&nbsp;fit</span> approach is handling data without a schema specified. The type for this data is interpreted as <span class="codefrag">bytearray</span>. Since the type of the data is not known, there is no way to choose a best fit version. The only time a cast is performed is when the function table contains only a single entry. This works well to maintain backward compatibility. </p>
882
- <p>Let's revisit the <span class="codefrag">UPPER</span> function from our first example. As it is written now, it would only work if the data passed to it is of type <span class="codefrag">chararray</span>. To make it work with data whose type is not explicitly set, a function table with a single entry needs to be added: </p>
883
- <pre class="code">
884
- package myudfs;
885
- import java.io.IOException;
886
- import org.apache.pig.EvalFunc;
887
- import org.apache.pig.data.Tuple;
- import org.apache.pig.FuncSpec;
- import org.apache.pig.data.DataType;
- import org.apache.pig.impl.logicalLayer.FrontendException;
- import org.apache.pig.impl.logicalLayer.schema.Schema;
- import java.util.List;
- import java.util.ArrayList;
888
-
889
- public class UPPER extends EvalFunc&lt;String&gt;
890
- {
891
- public String exec(Tuple input) throws IOException {
892
- if (input == null || input.size() == 0)
893
- return null;
894
- try{
895
- String str = (String)input.get(0);
896
- return str.toUpperCase();
897
- }catch(Exception e){
898
- System.err.println("WARN: UPPER: failed to process input; error - " + e.getMessage());
899
- return null;
900
- }
901
- }
902
- public List&lt;FuncSpec&gt; getArgToFuncMapping() throws FrontendException {
903
- List&lt;FuncSpec&gt; funcList = new ArrayList&lt;FuncSpec&gt;();
904
- funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY))));
905
- return funcList;
906
- }
907
- }
908
- </pre>
909
- <p>Now the following script will run: </p>
910
- <pre class="code">
911
- -- this is myscript.pig
912
- REGISTER myudfs.jar;
913
- A = LOAD 'student_data' AS (name, age, gpa);
914
- B = FOREACH A GENERATE myudfs.UPPER(name);
915
- DUMP B;
916
- </pre>
917
- <a name="N10397"></a><a name="Reporting+Progress"></a>
918
- <h3 class="h4">Reporting Progress</h3>
919
- <p>A challenge of running a large shared system is to make sure system resources are used efficiently. One aspect of this challenge is detecting runaway processes that are no longer making progress. Pig uses a heartbeat mechanism for this purpose. If any of the tasks stops sending a heartbeat, the system assumes that it is dead and kills it. </p>
920
- <p>Most of the time, single-tuple processing within a UDF is very short and does not require a UDF to heartbeat. The same is true for aggregate functions that operate on large bags because bag iteration code takes care of it. However, if you have a function that performs a complex computation that can take on the order of minutes to execute, you should add a progress indicator to your code. This is very easy to accomplish. The <span class="codefrag">EvalFunc</span> class provides a <span class="codefrag">progress</span> function that you need to call in your <span class="codefrag">exec</span> method. </p>
921
- <p>For instance, the <span class="codefrag">UPPER</span> function would now look as follows: </p>
922
- <pre class="code">
923
- public class UPPER extends EvalFunc&lt;String&gt;
924
- {
925
- public String exec(Tuple input) throws IOException {
926
- if (input == null || input.size() == 0)
927
- return null;
928
- try{
929
- reporter.progress();
930
- String str = (String)input.get(0);
931
- return str.toUpperCase();
932
- }catch(Exception e){
933
- throw WrappedIOException.wrap("Caught exception processing input row ", e);
934
- }
935
- }
936
- }
937
- </pre>
938
- <a name="N103B7"></a><a name="Import+Lists"></a>
939
- <h3 class="h4">Import Lists</h3>
940
- <p>An import list allows you to specify the package to which a UDF or a group of UDFs belong,
941
- eliminating the need to qualify the UDF on every call. An import list can be specified via the udf.import.list Java
942
- property on the Pig command line: </p>
943
- <pre class="code">
944
- pig -Dudf.import.list=com.yahoo.yst.sds.ULT
945
- </pre>
946
- <p>You can supply multiple locations as well: </p>
947
- <pre class="code">
948
- pig -Dudf.import.list=com.yahoo.yst.sds.ULT:org.apache.pig.piggybank.evaluation
949
- </pre>
950
- <p>To make use of import lists, do the following:</p>
951
- <pre class="code">
952
- myscript.pig:
953
- A = load '/data/SDS/data/searcg_US/20090820' using ULTLoader as (s, m, l);
954
- ....
955
-
956
- command:
957
- pig -cp sds.jar -Dudf.import.list=com.yahoo.yst.sds.ULT myscript.pig
958
- </pre>
959
- </div>
960
-
961
- <!-- BEGIN LOAD/STORE FUNCTIONS -->
962
-
963
- <a name="N103D6"></a><a name="Load%2FStore+Functions"></a>
964
- <h2 class="h3"> Load/Store Functions</h2>
965
- <div class="section">
966
- <p>The load/store user-defined functions control how data goes into Pig and comes out of Pig. Often, the same function handles both input and output but that does not have to be the case. </p>
967
- <p>
968
- With Pig 0.7.0, the Pig load/store API moves closer to using Hadoop's InputFormat and OutputFormat classes.
969
- This enables Pig users/developers to create new LoadFunc and StoreFunc implementations based on existing Hadoop InputFormat and OutputFormat classes with minimal code. The complexity of reading the data and creating a record lies in the InputFormat; likewise, on the writing end, the complexity of writing lies in the OutputFormat. This enables Pig to easily read/write data in new storage formats as and when a Hadoop InputFormat and OutputFormat become available for them. </p>
970
- <p>
971
-
972
- <strong>Note:</strong> Both the LoadFunc and StoreFunc implementations should use the Hadoop 20 API based classes (InputFormat/OutputFormat and related classes) under the <strong>new</strong> org.apache.hadoop.mapreduce package instead of the old org.apache.hadoop.mapred package.
973
- </p>
974
- <a name="N103EB"></a><a name="Load+Functions"></a>
975
- <h3 class="h4"> Load Functions</h3>
976
- <p>
977
- <a href="http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java?view=markup">LoadFunc</a>
978
- abstract class has the main methods for loading data and for most use cases it would suffice to extend it. There are three other optional interfaces which can be implemented to achieve extended functionality: </p>
979
- <ul>
980
-
981
- <li>
982
- <a href="http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java?view=markup">LoadMetadata</a>
983
- has methods to deal with metadata - most loader implementations don't need to implement this unless they interact with some metadata system. The getSchema() method in this interface provides a way for loader implementations to communicate the schema of the data back to Pig. If a loader implementation returns data comprised of fields of real types (rather than DataByteArray fields), it should provide the schema describing the data returned through the getSchema() method. The other methods are concerned with other types of metadata like partition keys and statistics. Implementations can return null for these methods if they are not applicable for that implementation. A minimal getSchema() sketch is shown after this list.</li>
984
-
985
- <li>
986
- <a href="http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java?view=markup">LoadPushDown</a>
987
- has methods to push operations from the Pig runtime into loader implementations - currently only projections, i.e., the pushProjection() method is called by Pig to communicate to the loader which exact fields are required in the Pig script. The loader implementation can choose to honor the request or respond that it will not honor the request and return all fields in the data. If a loader implementation is able to efficiently return only required fields, it should implement LoadPushDown to improve query performance. (Irrespective of whether the implementation can or cannot return only the required fields, if the implementation also implements getSchema(), the schema returned in getSchema() should be for the entire tuple of data.) </li>
988
-
989
- <li>
990
- <a href="http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java?view=markup">LoadCaster</a>
991
- has methods to convert byte arrays to specific types. A loader implementation should implement this if casts (implicit or explicit) from DataByteArray fields to other types need to be supported. </li>
992
-
993
- </ul>
994
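- <p>As a small illustration of the LoadMetadata point above, here is a minimal sketch (not taken from the Pig sources) of a loader that reports a fixed schema of (name: chararray, age: int); the schema is purely illustrative, the class extends the SimpleTextLoader shown later in this section, and the remaining LoadMetadata methods simply return null: </p>
- <pre class="code">
- import java.io.IOException;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.pig.Expression;
- import org.apache.pig.LoadMetadata;
- import org.apache.pig.ResourceSchema;
- import org.apache.pig.ResourceStatistics;
- import org.apache.pig.data.DataType;
- import org.apache.pig.impl.logicalLayer.schema.Schema;
-
- public class MyLoaderWithSchema extends SimpleTextLoader implements LoadMetadata {
-     // Report a fixed (name: chararray, age: int) schema back to Pig.
-     public ResourceSchema getSchema(String location, Job job) throws IOException {
-         Schema s = new Schema();
-         s.add(new Schema.FieldSchema("name", DataType.CHARARRAY));
-         s.add(new Schema.FieldSchema("age", DataType.INTEGER));
-         return new ResourceSchema(s);
-     }
-     // Statistics, partition keys and partition filtering are not applicable here.
-     public ResourceStatistics getStatistics(String location, Job job) { return null; }
-     public String[] getPartitionKeys(String location, Job job) { return null; }
-     public void setPartitionFilter(Expression partitionFilter) { }
- }
- </pre>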
- <p>The LoadFunc abstract class is the main class to extend for implementing a loader. The methods which need to be overridden are explained below:</p>
995
- <ul>
996
-
997
- <li>getInputFormat() :This method is called by Pig to get the InputFormat used by the loader. The methods in the InputFormat (and underlying RecordReader) are called by Pig in the same manner (and in the same context) as by Hadoop in a MapReduce java program. If the InputFormat is a Hadoop packaged one, the implementation should use the new API based one under org.apache.hadoop.mapreduce. If it is a custom InputFormat, it should be implemented using the new API in org.apache.hadoop.mapreduce.<br>
998
- <br>
999
-
1000
- If a custom loader using a text-based InputFormat or a file-based InputFormat would like to read files in all subdirectories under a given input directory recursively, then it should use the PigTextInputFormat and PigFileInputFormat classes provided in org.apache.pig.backend.hadoop.executionengine.mapReduceLayer. The Pig InputFormat classes work around a current limitation in the Hadoop TextInputFormat and FileInputFormat classes which only read one level down from the provided input directory. For example, if the input in the load statement is 'dir1' and there are subdirs 'dir2' and 'dir2/dir3' beneath dir1, the Hadoop TextInputFormat and FileInputFormat classes read the files under 'dir1' only. Using PigTextInputFormat or PigFileInputFormat (or by extending them), the files in all the directories can be read. </li>
1001
-
1002
-
1003
- <li>setLocation() :This method is called by Pig to communicate the load location to the loader. The loader should use this method to communicate the same information to the underlying InputFormat. This method is called multiple times by pig - implementations should bear this in mind and should ensure there are no inconsistent side effects due to the multiple calls. </li>
1004
-
1005
-
1006
- <li>prepareToRead() : Through this method the RecordReader associated with the InputFormat provided by the LoadFunc is passed to the LoadFunc. The RecordReader can then be used by the implementation in getNext() to return a tuple representing a record of data back to pig. </li>
1007
-
1008
- <li>getNext() :The meaning of getNext() has not changed and is called by Pig runtime to get the next tuple in the data - in this method the implementation should use the underlying RecordReader and construct the tuple to return. </li>
1009
-
1010
- </ul>
1011
- <p>The following methods have default implementations in LoadFunc and should be overridden only if needed: </p>
1012
- <ul>
1013
-
1014
- <li>setUdfContextSignature():This method will be called by Pig both in the front end and back end to pass a unique signature to the Loader. The signature can be used to store into the UDFContext any information which the Loader needs to store between various method invocations in the front end and back end. A use case is to store RequiredFieldList passed to it in LoadPushDown.pushProjection(RequiredFieldList) for use in the back end before returning tuples in getNext(). The default implementation in LoadFunc has an empty body. This method will be called before other methods. </li>
1015
-
1016
- <li>relativeToAbsolutePath():Pig runtime will call this method to allow the Loader to convert a relative load location to an absolute location. The default implementation provided in LoadFunc handles this for FileSystem locations. If the load source is something else, loader implementation may choose to override this.</li>
1017
-
1018
- </ul>
1019
- <p>
1020
- <strong>Example Implementation</strong>
1021
- </p>
1022
- <p>
1023
- The loader implementation in the example is a loader for text data with '\n' as the line delimiter and '\t' as the default field delimiter (which can be overridden by passing a different field delimiter in the constructor) - this is similar to the current PigStorage loader in Pig. The implementation uses an existing Hadoop-supported InputFormat - TextInputFormat - as the underlying InputFormat.
1024
- </p>
1025
- <pre class="code">
1026
- public class SimpleTextLoader extends LoadFunc {
1027
- protected RecordReader in = null;
1028
- private byte fieldDel = '\t';
1029
- private ArrayList&lt;Object&gt; mProtoTuple = null;
1030
- private TupleFactory mTupleFactory = TupleFactory.getInstance();
1031
- private static final int BUFFER_SIZE = 1024;
1032
-
1033
- public SimpleTextLoader() {
1034
- }
1035
-
1036
- /**
1037
- * Constructs a Pig loader that uses specified character as a field delimiter.
1038
- *
1039
- * @param delimiter
1040
- * the single byte character that is used to separate fields.
1041
- * ("\t" is the default.)
1042
- */
1043
- public SimpleTextLoader(String delimiter) {
1044
- this();
1045
- if (delimiter.length() == 1) {
1046
- this.fieldDel = (byte)delimiter.charAt(0);
1047
- } else if (delimiter.length() &gt; 1 &amp;&amp; delimiter.charAt(0) == '\\') {
1048
- switch (delimiter.charAt(1)) {
1049
- case 't':
1050
- this.fieldDel = (byte)'\t';
1051
- break;
1052
-
1053
- case 'x':
1054
- fieldDel =
1055
- Integer.valueOf(delimiter.substring(2), 16).byteValue();
1056
- break;
1057
-
1058
- case 'u':
1059
- this.fieldDel =
1060
- Integer.valueOf(delimiter.substring(2)).byteValue();
1061
- break;
1062
-
1063
- default:
1064
- throw new RuntimeException("Unknown delimiter " + delimiter);
1065
- }
1066
- } else {
1067
- throw new RuntimeException("PigStorage delimeter must be a single character");
1068
- }
1069
- }
1070
-
1071
- @Override
1072
- public Tuple getNext() throws IOException {
1073
- try {
1074
- boolean notDone = in.nextKeyValue();
1075
- if (!notDone) {
1076
- return null;
1077
- }
1078
- Text value = (Text) in.getCurrentValue();
1079
- byte[] buf = value.getBytes();
1080
- int len = value.getLength();
1081
- int start = 0;
1082
-
1083
- for (int i = 0; i &lt; len; i++) {
1084
- if (buf[i] == fieldDel) {
1085
- readField(buf, start, i);
1086
- start = i + 1;
1087
- }
1088
- }
1089
- // pick up the last field
1090
- readField(buf, start, len);
1091
-
1092
- Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
1093
- mProtoTuple = null;
1094
- return t;
1095
- } catch (InterruptedException e) {
1096
- int errCode = 6018;
1097
- String errMsg = "Error while reading input";
1098
- throw new ExecException(errMsg, errCode,
1099
- PigException.REMOTE_ENVIRONMENT, e);
1100
- }
1101
-
1102
- }
1103
-
1104
- private void readField(byte[] buf, int start, int end) {
1105
- if (mProtoTuple == null) {
1106
- mProtoTuple = new ArrayList&lt;Object&gt;();
1107
- }
1108
-
1109
- if (start == end) {
1110
- // NULL value
1111
- mProtoTuple.add(null);
1112
- } else {
1113
- mProtoTuple.add(new DataByteArray(buf, start, end));
1114
- }
1115
- }
1116
-
1117
- @Override
1118
- public InputFormat getInputFormat() {
1119
- return new TextInputFormat();
1120
- }
1121
-
1122
- @Override
1123
- public void prepareToRead(RecordReader reader, PigSplit split) {
1124
- in = reader;
1125
- }
1126
-
1127
- @Override
1128
- public void setLocation(String location, Job job)
1129
- throws IOException {
1130
- FileInputFormat.setInputPaths(job, location);
1131
- }
1132
- }
1133
- </pre>
1134
- <a name="N1043C"></a><a name="Store+Functions"></a>
1135
- <h3 class="h4"> Store Functions</h3>
1136
- <p>
1137
- <a href="http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java?view=markup">StoreFunc</a>
1138
- abstract class has the main methods for storing data and for most use cases it should suffice to extend it. There is an optional interface which can be implemented to achieve extended functionality: </p>
1139
- <ul>
1140
-
1141
- <li>
1142
- <a href="http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/StoreMetadata.java?view=markup">StoreMetadata:</a>
1143
- This interface has methods to interact with metadata systems to store schema and store statistics. This interface is truly optional and should only be implemented if metadata needs to be stored. </li>
1144
-
1145
- </ul>
1146
- <p>The methods which need to be overridden in StoreFunc are explained below: </p>
1147
- <ul>
1148
-
1149
- <li>getOutputFormat(): This method will be called by Pig to get the OutputFormat used by the storer. The methods in the OutputFormat (and underlying RecordWriter and OutputCommitter) will be called by pig in the same manner (and in the same context) as by Hadoop in a map-reduce java program. If the OutputFormat is a hadoop packaged one, the implementation should use the new API based one under org.apache.hadoop.mapreduce. If it is a custom OutputFormat, it should be implemented using the new API under org.apache.hadoop.mapreduce. The checkOutputSpecs() method of the OutputFormat will be called by pig to check the output location up-front. This method will also be called as part of the Hadoop call sequence when the job is launched. So implementations should ensure that this method can be called multiple times without inconsistent side effects. </li>
1150
-
1151
- <li>setStoreLocation(): This method is called by Pig to communicate the store location to the storer. The storer should use this method to communicate the same information to the underlying OutputFormat. This method is called multiple times by pig - implementations should bear in mind that this method is called multiple times and should ensure there are no inconsistent side effects due to the multiple calls. </li>
1152
-
1153
- <li>prepareToWrite(): In the new API, writing of the data is through the OutputFormat provided by the StoreFunc. In prepareToWrite() the RecordWriter associated with the OutputFormat provided by the StoreFunc is passed to the StoreFunc. The RecordWriter can then be used by the implementation in putNext() to write a tuple representing a record of data in a manner expected by the RecordWriter. </li>
1154
-
1155
- <li>putNext(): The meaning of putNext() has not changed and is called by Pig runtime to write the next tuple of data - in the new API, this is the method wherein the implementation will use the underlying RecordWriter to write the Tuple out.</li>
1156
-
1157
- </ul>
1158
- <p>The following methods have default implementations in StoreFunc and should be overridden only if necessary: </p>
1159
- <ul>
1160
-
1161
- <li>setStoreFuncUDFContextSignature(): This method will be called by Pig both in the front end and back end to pass a unique signature to the Storer. The signature can be used to store into the UDFContext any information which the Storer needs to store between various method invocations in the front end and back end. The default implementation in StoreFunc has an empty body. This method will be called before other methods.
1162
- </li>
1163
-
1164
- <li>relToAbsPathForStoreLocation(): Pig runtime will call this method to allow the Storer to convert a relative store location to an absolute location. An implementation is provided in StoreFunc which handles this for FileSystem based locations. </li>
1165
-
1166
- <li>checkSchema(): A Store function should implement this function to check that a given schema describing the data to be written is acceptable to it. The default implementation in StoreFunc has an empty body. This method will be called before any calls to setStoreLocation(). A minimal sketch follows this list. </li>
1167
-
1168
- </ul>
1169
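- <p>As a small illustration of checkSchema() (a minimal sketch, not taken from the Pig sources), a storer that insists on exactly two output fields could look like this; the field-count requirement and class name are purely illustrative, and the class extends the SimpleTextStorer shown in the example below: </p>
- <pre class="code">
- import java.io.IOException;
- import org.apache.pig.ResourceSchema;
-
- // Reject any schema that does not have exactly two fields.
- public class TwoFieldStorer extends SimpleTextStorer {
-     @Override
-     public void checkSchema(ResourceSchema s) throws IOException {
-         if (s.getFields().length != 2) {
-             throw new IOException("Expected exactly 2 fields, got " + s.getFields().length);
-         }
-     }
- }
- </pre>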
- <p>
1170
- <strong>Example Implementation</strong>
1171
- </p>
1172
- <p>
1173
- The storer implementation in the example is a storer for text data with '\n' as the line delimiter and '\t' as the default field delimiter (which can be overridden by passing a different field delimiter in the constructor) - this is similar to the current PigStorage storer in Pig. The implementation uses an existing Hadoop-supported OutputFormat - TextOutputFormat - as the underlying OutputFormat.
1174
- </p>
1175
- <pre class="code">
1176
- public class SimpleTextStorer extends StoreFunc {
1177
- protected RecordWriter writer = null;
1178
-
1179
- private byte fieldDel = '\t';
1180
- private static final int BUFFER_SIZE = 1024;
1181
- private static final String UTF8 = "UTF-8";
1182
- public SimpleTextStorer() {
1183
- }
1184
-
1185
- public SimpleTextStorer(String delimiter) {
1186
- this();
1187
- if (delimiter.length() == 1) {
1188
- this.fieldDel = (byte)delimiter.charAt(0);
1189
- } else if (delimiter.length() &gt; 1 &amp;&amp; delimiter.charAt(0) == '\\') {
1190
- switch (delimiter.charAt(1)) {
1191
- case 't':
1192
- this.fieldDel = (byte)'\t';
1193
- break;
1194
-
1195
- case 'x':
1196
- fieldDel =
1197
- Integer.valueOf(delimiter.substring(2), 16).byteValue();
1198
- break;
1199
- case 'u':
1200
- this.fieldDel =
1201
- Integer.valueOf(delimiter.substring(2)).byteValue();
1202
- break;
1203
-
1204
- default:
1205
- throw new RuntimeException("Unknown delimiter " + delimiter);
1206
- }
1207
- } else {
1208
- throw new RuntimeException("PigStorage delimeter must be a single character");
1209
- }
1210
- }
1211
-
1212
- ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);
1213
-
1214
- @Override
1215
- public void putNext(Tuple f) throws IOException {
1216
- int sz = f.size();
1217
- for (int i = 0; i &lt; sz; i++) {
1218
- Object field;
1219
- try {
1220
- field = f.get(i);
1221
- } catch (ExecException ee) {
1222
- throw ee;
1223
- }
1224
-
1225
- putField(field);
1226
-
1227
- if (i != sz - 1) {
1228
- mOut.write(fieldDel);
1229
- }
1230
- }
1231
- Text text = new Text(mOut.toByteArray());
1232
- try {
1233
- writer.write(null, text);
1234
- mOut.reset();
1235
- } catch (InterruptedException e) {
1236
- throw new IOException(e);
1237
- }
1238
- }
1239
-
1240
- @SuppressWarnings("unchecked")
1241
- private void putField(Object field) throws IOException {
1242
- //string constants for each delimiter
1243
- String tupleBeginDelim = "(";
1244
- String tupleEndDelim = ")";
1245
- String bagBeginDelim = "{";
1246
- String bagEndDelim = "}";
1247
- String mapBeginDelim = "[";
1248
- String mapEndDelim = "]";
1249
- String fieldDelim = ",";
1250
- String mapKeyValueDelim = "#";
1251
-
1252
- switch (DataType.findType(field)) {
1253
- case DataType.NULL:
1254
- break; // just leave it empty
1255
-
1256
- case DataType.BOOLEAN:
1257
- mOut.write(((Boolean)field).toString().getBytes());
1258
- break;
1259
-
1260
- case DataType.INTEGER:
1261
- mOut.write(((Integer)field).toString().getBytes());
1262
- break;
1263
-
1264
- case DataType.LONG:
1265
- mOut.write(((Long)field).toString().getBytes());
1266
- break;
1267
-
1268
- case DataType.FLOAT:
1269
- mOut.write(((Float)field).toString().getBytes());
1270
- break;
1271
-
1272
- case DataType.DOUBLE:
1273
- mOut.write(((Double)field).toString().getBytes());
1274
- break;
1275
-
1276
- case DataType.BYTEARRAY: {
1277
- byte[] b = ((DataByteArray)field).get();
1278
- mOut.write(b, 0, b.length);
1279
- break;
1280
- }
1281
-
1282
- case DataType.CHARARRAY:
1283
- // oddly enough, writeBytes writes a string
1284
- mOut.write(((String)field).getBytes(UTF8));
1285
- break;
1286
-
1287
- case DataType.MAP:
1288
- boolean mapHasNext = false;
1289
- Map&lt;String, Object&gt; m = (Map&lt;String, Object&gt;)field;
1290
- mOut.write(mapBeginDelim.getBytes(UTF8));
1291
- for(Map.Entry&lt;String, Object&gt; e: m.entrySet()) {
1292
- if(mapHasNext) {
1293
- mOut.write(fieldDelim.getBytes(UTF8));
1294
- } else {
1295
- mapHasNext = true;
1296
- }
1297
- putField(e.getKey());
1298
- mOut.write(mapKeyValueDelim.getBytes(UTF8));
1299
- putField(e.getValue());
1300
- }
1301
- mOut.write(mapEndDelim.getBytes(UTF8));
1302
- break;
1303
-
1304
- case DataType.TUPLE:
1305
- boolean tupleHasNext = false;
1306
- Tuple t = (Tuple)field;
1307
- mOut.write(tupleBeginDelim.getBytes(UTF8));
1308
- for(int i = 0; i &lt; t.size(); ++i) {
1309
- if(tupleHasNext) {
1310
- mOut.write(fieldDelim.getBytes(UTF8));
1311
- } else {
1312
- tupleHasNext = true;
1313
- }
1314
- try {
1315
- putField(t.get(i));
1316
- } catch (ExecException ee) {
1317
- throw ee;
1318
- }
1319
- }
1320
- mOut.write(tupleEndDelim.getBytes(UTF8));
1321
- break;
1322
-
1323
- case DataType.BAG:
1324
- boolean bagHasNext = false;
1325
- mOut.write(bagBeginDelim.getBytes(UTF8));
1326
- Iterator&lt;Tuple&gt; tupleIter = ((DataBag)field).iterator();
1327
- while(tupleIter.hasNext()) {
1328
- if(bagHasNext) {
1329
- mOut.write(fieldDelim.getBytes(UTF8));
1330
- } else {
1331
- bagHasNext = true;
1332
- }
1333
- putField((Object)tupleIter.next());
1334
- }
1335
- mOut.write(bagEndDelim.getBytes(UTF8));
1336
- break;
1337
-
1338
- default: {
1339
- int errCode = 2108;
1340
- String msg = "Could not determine data type of field: " + field;
1341
- throw new ExecException(msg, errCode, PigException.BUG);
1342
- }
1343
-
1344
- }
1345
- }
1346
-
1347
- @Override
1348
- public OutputFormat getOutputFormat() {
1349
- return new TextOutputFormat&lt;WritableComparable, Text&gt;();
1350
- }
1351
-
1352
- @Override
1353
- public void prepareToWrite(RecordWriter writer) {
1354
- this.writer = writer;
1355
- }
1356
-
1357
- @Override
1358
- public void setStoreLocation(String location, Job job) throws IOException {
1359
- job.getConfiguration().set("mapred.textoutputformat.separator", "");
1360
- FileOutputFormat.setOutputPath(job, new Path(location));
1361
- if (location.endsWith(".bz2")) {
1362
- FileOutputFormat.setCompressOutput(job, true);
1363
- FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
1364
- } else if (location.endsWith(".gz")) {
1365
- FileOutputFormat.setCompressOutput(job, true);
1366
- FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
1367
- }
1368
- }
1369
- }
1370
- </pre>
1371
- </div>
1372
- <!-- END LOAD/STORE FUNCTIONS -->
1373
-
1374
-
1375
-
1376
- <a name="N10483"></a><a name="Builtin+Functions+and+Function+Repositories"></a>
1377
- <h2 class="h3">Builtin Functions and Function Repositories</h2>
1378
- <div class="section">
1379
- <p>Pig comes with a set of builtin functions. Two main properties differentiate builtin functions from UDFs. First, they don't need to be registered because Pig knows where they are. Second, they don't need to be qualified when used because Pig knows where to find them. </p>
1380
- <p>Pig also hosts a UDF repository called <span class="codefrag">piggybank</span> that allows users to share UDFs that they have written. The details are described in <a href="http://wiki.apache.org/pig/PiggyBank"> PiggyBank</a>. </p>
1381
- </div>
1382
-
1383
-
1384
- <a name="N10497"></a><a name="Accumulator+Interface"></a>
1385
- <h2 class="h3">Accumulator Interface</h2>
1386
- <div class="section">
1387
- <p>In Pig, problems with memory usage can occur when data, which results from a group or cogroup operation, needs to be placed in a bag and passed in its entirety to a UDF.</p>
1388
- <p>This problem is partially addressed by Algebraic UDFs that use the combiner and can deal with data being passed to them incrementally during different processing phases (map, combiner, and reduce.) However, there are a number of UDFs that are not Algebraic, don't use the combiner, but still don&rsquo;t need to be given all data at once. </p>
1389
- <p>The new Accumulator interface is designed to decrease memory usage by targeting such UDFs. For the functions that implement this interface, Pig guarantees that the data for the same key is passed continuously but in small increments. To work with incremental data, here is the interface a UDF needs to implement:</p>
1390
- <pre class="code">
1391
- public interface Accumulator &lt;T&gt; {
1392
- /**
1393
- * Process tuples. Each DataBag may contain 0 to many tuples for current key
1394
- */
1395
- public void accumulate(Tuple b) throws IOException;
1396
- /**
1397
- * Called when all tuples from current key have been passed to the accumulator.
1398
- * @return the value for the UDF for this key.
1399
- */
1400
- public T getValue();
1401
- /**
1402
- * Called after getValue() to prepare processing for next key.
1403
- */
1404
- public void cleanup();
1405
- }
1406
- </pre>
1407
- <p>There are several things to note here:</p>
1408
- <ol>
1409
-
1410
- <li>Each UDF must extend the EvalFunc class and implement all necessary functions there.</li>
1411
-
1412
- <li>If a function is algebraic but can be used in a FOREACH statement with accumulator functions, it needs to implement the Accumulator interface in addition to the Algebraic interface.</li>
1413
-
1414
- <li>The interface is parameterized with the return type of the function.</li>
1415
-
1416
- <li>The accumulate function is guaranteed to be called one or more times, passing one or more tuples in a bag, to the UDF. (Note that the tuple that is passed to the accumulator has the same content as the one passed to exec &ndash; all the parameters passed to the UDF &ndash; one of which should be a bag).</li>
1417
-
1418
- <li>The getValue function is called after all the tuples for a particular key have been processed to retrieve the final value.</li>
1419
-
1420
- <li>The cleanup function is called after getValue but before the next value is processed.</li>
1421
-
1422
- </ol>
1423
- <p>Here is a code snippet of the integer version of the MAX function that implements the interface:</p>
1424
- <pre class="code">
1425
- public class IntMax extends EvalFunc&lt;Integer&gt; implements Algebraic, Accumulator&lt;Integer&gt; {
1426
- &hellip;&hellip;.
1427
- /* Accumulator interface */
1428
-
1429
- private Integer intermediateMax = null;
1430
-
1431
- @Override
1432
- public void accumulate(Tuple b) throws IOException {
1433
- try {
1434
- Integer curMax = max(b);
1435
- if (curMax == null) {
1436
- return;
1437
- }
1438
- /* if bag is not null, initialize intermediateMax to negative infinity */
1439
- if (intermediateMax == null) {
1440
- intermediateMax = Integer.MIN_VALUE;
1441
- }
1442
- intermediateMax = java.lang.Math.max(intermediateMax, curMax);
1443
- } catch (ExecException ee) {
1444
- throw ee;
1445
- } catch (Exception e) {
1446
- int errCode = 2106;
1447
- String msg = "Error while computing max in " + this.getClass().getSimpleName();
1448
- throw new ExecException(msg, errCode, PigException.BUG, e);
1449
- }
1450
- }
1451
-
1452
- @Override
1453
- public void cleanup() {
1454
- intermediateMax = null;
1455
- }
1456
-
1457
- @Override
1458
- public Integer getValue() {
1459
- return intermediateMax;
1460
- }
1461
- }
1462
- </pre>
1463
- </div>
1464
-
1465
-
1466
-
1467
- <a name="N104CA"></a><a name="Advanced+Topics"></a>
1468
- <h2 class="h3">Advanced Topics</h2>
1469
- <div class="section">
1470
- <a name="N104D0"></a><a name="Function+Instantiation"></a>
1471
- <h3 class="h4">Function Instantiation</h3>
1472
- <p>One problem that users run into is when they make assumptions about how many times a constructor for their UDF is called. For instance, they might be creating side files in the store function, and doing so in the constructor might seem like a good idea. The problem with this approach is that in most cases Pig instantiates functions on the client side to, for instance, examine the schema of the data. </p>
1473
- <p>Users should not make assumptions about how many times a function is instantiated; instead, they should make their code resilient to multiple instantiations. For instance, they could check if the files exist before creating them. </p>
1474
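- <p>As an illustration (a minimal sketch with hypothetical file and class names, not a prescribed pattern), a store function could defer side-file creation from the constructor to setStoreLocation() and guard it with an existence check: </p>
- <pre class="code">
- import java.io.IOException;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.mapreduce.Job;
-
- // Inside a StoreFunc subclass: create the side file lazily, and only if it
- // does not already exist, so repeated instantiations are harmless.
- @Override
- public void setStoreLocation(String location, Job job) throws IOException {
-     // ... set the output location on the OutputFormat as usual ...
-     Path sideFile = new Path("/tmp/my_udf_side_file");   // illustrative location
-     FileSystem fs = FileSystem.get(job.getConfiguration());
-     if (!fs.exists(sideFile)) {
-         fs.create(sideFile).close();
-     }
- }
- </pre>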
- <a name="N104DD"></a><a name="Schemas"></a>
1475
- <h3 class="h4">Schemas</h3>
1476
- <p>One request from users is to have the ability to examine the input schema of the data before processing the data. For example, they would like to know how to convert an input tuple to a map such that the keys in the map are the names of the input columns. The current answer is that there is no way to do this. This is something we would like to support in the future. </p>
1477
- <a name="N104E7"></a><a name="Passing+Configurations+to+UDFs"></a>
1478
- <h3 class="h4">Passing Configurations to UDFs</h3>
1479
- <p>The singleton UDFContext class provides two features to UDF writers. First, on the backend, it allows UDFs to get access to the JobConf object, by calling getJobConf. This is only available on the backend (at run time) as the JobConf has not yet been constructed on the front end (during planning time).</p>
1480
- <p>Second, it allows UDFs to pass configuration information between instantiations of the UDF on the front and backends. UDFs can store information in a configuration object when they are constructed on the front end, or during other front end calls such as describeSchema. They can then read that information on the backend when exec (for EvalFunc) or getNext (for LoadFunc) is called. Note that information will not be passed between instantiations of the function on the backend. The communication channel only works from front end to back end.</p>
1481
- <p>To store information, the UDF calls getUDFProperties. This returns a Properties object which the UDF can record the information in or read the information from. To avoid name space conflicts UDFs are required to provide a signature when obtaining a Properties object. This can be done in two ways. The UDF can provide its Class object (via this.getClass()). In this case, every instantiation of the UDF will be given the same Properties object. The UDF can also provide its Class plus an array of Strings. The UDF can pass its constructor arguments, or some other identifying strings. This allows each instantiation of the UDF to have a different properties object thus avoiding name space collisions between instantiations of the UDF.</p>
1482
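- <p>A minimal sketch of this round trip (the property key and value are illustrative): the constructor records a setting on the front end, and exec() reads it back on the backend. </p>
- <pre class="code">
- import java.util.Properties;
- import org.apache.pig.impl.util.UDFContext;
-
- // In the UDF constructor (front end): record a setting for later use.
- Properties props = UDFContext.getUDFContext().getUDFProperties(this.getClass());
- props.setProperty("my.udf.mode", "strict");   // illustrative key/value
-
- // In exec() (back end): read the setting back.
- Properties props2 = UDFContext.getUDFContext().getUDFProperties(this.getClass());
- String mode = props2.getProperty("my.udf.mode");
- </pre>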
- </div>
1483
-
1484
-
1485
- </div>
1486
- <!--+
1487
- |end content
1488
- +-->
1489
- <div class="clearboth">&nbsp;</div>
1490
- </div>
1491
- <div id="footer">
1492
- <!--+
1493
- |start bottomstrip
1494
- +-->
1495
- <div class="lastmodified">
1496
- <script type="text/javascript"><!--
1497
- document.write("Last Published: " + document.lastModified);
1498
- // --></script>
1499
- </div>
1500
- <div class="copyright">
1501
- Copyright &copy;
1502
- 2007-2010 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
1503
- </div>
1504
- <!--+
1505
- |end bottomstrip
1506
- +-->
1507
- </div>
1508
- </body>
1509
- </html>