wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,5 +0,0 @@
1
-
2
-
3
- #
4
- # Placeholder -- avro will go here
5
- #
@@ -1,40 +0,0 @@
1
- Settings.define :cassandra_keyspace, :required => true, :description => "The keyspace to bulk load"
2
- Settings.define :cassandra_col_family, :required => true, :description => "The column family to bulk load"
3
- Settings.define :cassandra_home, :env_var => 'CASSANDRA_HOME', :default => '/usr/local/share/cassandra'
4
-
5
- module Wukong
6
- class CassandraScript < Wukong::Script
7
- def hadoop_other_args *args
8
- opts = super(*args)
9
- opts << "-D stream.map.output=\'cassandra_avro_output\'"
10
- opts << "-D stream.io.identifier.resolver.class=\'org.apache.cassandra.hadoop.streaming.AvroResolver\'"
11
- opts << "-D cassandra.output.keyspace=\'#{Settings.cassandra_keyspace}\'"
12
- opts << "-D cassandra.output.columnfamily=\'#{Settings.cassandra_col_family}\'"
13
- opts << "-D cassandra.partitioner.class=\'org.apache.cassandra.dht.RandomPartitioner\'"
14
- opts << "-D cassandra.thrift.address=\'#{[Settings.cassandra_hosts].flatten.map{|s| s.gsub(/:.*/, '')}.join(",")}\'"
15
- opts << "-D cassandra.thrift.port=\'9160\'"
16
- # opts << "-D mapreduce.output.columnfamilyoutputformat.batch.threshold=\'1024\'"
17
- # ORDER MATTERS
18
- opts << "-libjars \'#{cassandra_jars}\'"
19
- opts << "-file \'#{avro_schema}\'"
20
- opts << "-outputformat \'org.apache.cassandra.hadoop.ColumnFamilyOutputFormat\'"
21
- opts
22
- end
23
-
24
- #
25
- # Return paths to cassandra jars as a string
26
- #
27
- def cassandra_jars
28
- jars = []
29
- Dir["#{Settings.cassandra_home}/build/apache-cassandra*.jar", "#{Settings.cassandra_home}/build/lib/jars/*.jar", "#{Settings.cassandra_home}/lib/*.jar"].each do |jar|
30
- jars << jar
31
- end
32
- jars.join(',')
33
- end
34
-
35
- def avro_schema
36
- File.join(Settings.cassandra_home, "interface/avro/cassandra.avpr")
37
- end
38
-
39
- end
40
- end
@@ -1,168 +0,0 @@
1
- require 'right_aws'
2
- require 'configliere/config_block'
3
- #
4
- EMR_CONFIG_DIR = '~/.wukong' unless defined?(EMR_CONFIG_DIR)
5
- #
6
- Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
7
- Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
8
- Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
9
- Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
10
- Settings.define :emr_root, :description => 'S3 bucket and path to use as the base for Elastic MapReduce storage, organized by job name'
11
- Settings.define :emr_data_root, :description => 'Optional '
12
- Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for Elastic Map Reduce machine provisioning', :default => EMR_CONFIG_DIR+'/emr_bootstrap.sh', :type => :filename, :finally => lambda{ Settings.emr_bootstrap_script = File.expand_path(Settings.emr_bootstrap_script) }
13
- Settings.define :emr_extra_args, :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
14
- Settings.define :alive, :description => 'Whether to keep machine running after job invocation', :type => :boolean
15
- #
16
- Settings.define :key_pair_file, :description => 'AWS Key pair file', :type => :filename
17
- Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
18
- Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
19
- Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
20
- Settings.define :jobflow, :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
21
- #
22
- Settings.read(File.expand_path(EMR_CONFIG_DIR+'/emr.yaml'))
23
-
24
- module Wukong
25
- #
26
- # EMR Options
27
- #
28
- module EmrCommand
29
-
30
- def execute_emr_workflow
31
- copy_script_to_cloud
32
- execute_emr_runner
33
- end
34
-
35
- def copy_script_to_cloud
36
- Log.info " Copying this script to the cloud."
37
- S3Util.store(this_script_filename, mapper_s3_uri)
38
- S3Util.store(this_script_filename, reducer_s3_uri)
39
- S3Util.store(File.expand_path(Settings.emr_bootstrap_script), bootstrap_s3_uri)
40
- end
41
-
42
- def copy_jars_to_cloud
43
- S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
44
- # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
45
- end
46
-
47
- def hadoop_options_for_emr_runner
48
- [hadoop_jobconf_options, hadoop_other_args].flatten.compact.uniq.map do |hdp_opt|
49
- hdp_opt.split(' ').map {|part| "--arg '#{part}'"}
50
- end.flatten
51
- end
52
-
53
- def execute_emr_runner
54
- # fix_paths!
55
- command_args = []
56
- if Settings.jobflow
57
- command_args << Settings.dashed_flag_for(:jobflow)
58
- else
59
- command_args << "--create --name=#{job_name}"
60
- command_args << Settings.dashed_flag_for(:alive)
61
- command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
62
- command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
63
- command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
64
- end
65
- command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
66
- command_args += emr_credentials
67
- command_args += [
68
- "--log-uri=#{log_s3_uri}",
69
- "--stream",
70
- "--mapper=#{mapper_s3_uri} ",
71
- "--reducer=#{reducer_s3_uri} ",
72
- "--input=#{input_paths.join(",")} --output=#{output_path}",
73
- ]
74
- # eg to specify zero reducers:
75
- # Settings[:emr_extra_args] = "--arg '-D mapred.reduce.tasks=0'"
76
- command_args += Settings[:emr_extra_args] unless Settings[:emr_extra_args].blank?
77
- command_args += hadoop_options_for_emr_runner
78
- Log.info 'Follow along at http://localhost:9000/job'
79
- execute_command!( File.expand_path(Settings.emr_runner), *command_args )
80
- end
81
-
82
- def emr_credentials
83
- command_args = []
84
- if Settings.emr_credentials_file
85
- command_args << "--credentials #{File.expand_path(Settings.emr_credentials_file)}"
86
- else
87
- command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
88
- end
89
- command_args
90
- end
91
-
92
- # A short name for this job
93
- def job_handle
94
- File.basename($0,'.rb')
95
- end
96
-
97
- # Produces an s3 URI within the Wukong emr sandbox from a set of path
98
- # segments
99
- #
100
- # @example
101
- # Settings.emr_root = 's3://emr.yourmom.com/wukong'
102
- # emr_s3_path('log', 'my_happy_job', 'run-97.log')
103
- # # => "s3://emr.yourmom.com/wukong/log/my_happy_job/run-97.log"
104
- #
105
- def emr_s3_path *path_segs
106
- File.join(Settings.emr_root, path_segs.flatten.compact)
107
- end
108
-
109
- def mapper_s3_uri
110
- emr_s3_path(job_handle, 'code', job_handle+'-mapper.rb')
111
- end
112
- def reducer_s3_uri
113
- emr_s3_path(job_handle, 'code', job_handle+'-reducer.rb')
114
- end
115
- def log_s3_uri
116
- emr_s3_path(job_handle, 'log', 'emr_jobs')
117
- end
118
- def bootstrap_s3_uri
119
- emr_s3_path(job_handle, 'bin', "emr_bootstrap.sh")
120
- end
121
- def wukong_libs_s3_uri
122
- emr_s3_path(job_handle, 'code', "wukong-libs.jar")
123
- end
124
-
125
- ABSOLUTE_URI = %r{^/|^\w+://}
126
- #
127
- # Walk through the input paths and the output path. Prepends
128
- # Settings.emr_data_root to any that does NOT look like
129
- # an absolute path ("/foo") or a URI ("s3://yourmom/data")
130
- #
131
- def fix_paths!
132
- return if Settings.emr_data_root.blank?
133
- unless input_paths.blank?
134
- @input_paths = input_paths.map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
135
- end
136
- unless output_path.blank?
137
- @output_path = [output_path].map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
138
- end
139
- end
140
-
141
- #
142
- # Simple class to coordinate s3 operations
143
- #
144
- class S3Util
145
- # class methods
146
- class << self
147
- def s3
148
- @s3 ||= RightAws::S3Interface.new(
149
- Settings.access_key, Settings.secret_access_key,
150
- {:multi_thread => true, :logger => Log, :port => 80, :protocol => 'http' })
151
- end
152
- def bucket_and_path_from_uri uri
153
- uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
154
- end
155
- def store filename, uri
156
- dest_bucket, dest_key = bucket_and_path_from_uri(uri)
157
- Log.debug " #{filename} => #{dest_bucket} / #{dest_key}"
158
- contents = File.read(filename)
159
- s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
160
- end
161
- end
162
- end
163
-
164
- end
165
- Script.class_eval do
166
- include EmrCommand
167
- end
168
- end
@@ -1,237 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- module Wukong
3
- module HadoopCommand
4
-
5
- # ===========================================================================
6
- #
7
- # Hadoop Options
8
- #
9
- Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :description => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME', :wukong => true
10
- Settings.define :hadoop_runner, :description => "Path to hadoop script. Usually set --hadoop_home instead of this.", :wukong => true
11
-
12
- #
13
- # Translate simplified args to their hairy hadoop equivalents
14
- #
15
- Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
16
- Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
17
- Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
18
- Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
19
- Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
20
- Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
21
- Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
22
- Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
23
- Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
24
- Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
25
- Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
26
- Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
27
- Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
28
- Settings.define :max_tracker_failures, :jobconf => true, :description => 'mapred.max.tracker.failures', :wukong => true
29
- Settings.define :max_map_attempts, :jobconf => true, :description => 'mapred.map.max.attempts', :wukong => true
30
- Settings.define :max_reduce_attempts, :jobconf => true, :description => 'mapred.reduce.max.attempts', :wukong => true
31
- Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
32
- Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
33
- Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
34
- Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
35
- Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
36
- Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
37
- Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
38
- Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
39
- Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
40
- Settings.define :split_on_xml_tag, :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
41
-
42
- # emit a -jobconf hadoop option if the simplified command line arg is present
43
- # if not, the resulting nil will be elided later
44
- def jobconf option
45
- if options[option]
46
- # "-jobconf %s=%s" % [options.definition_of(option, :description), options[option]]
47
- "-D %s=%s" % [options.definition_of(option, :description), options[option]]
48
- end
49
- end
50
-
51
- #
52
- # Assemble the hadoop command to execute
53
- # and launch the hadoop runner to execute the script across all tasktrackers
54
- #
55
- # FIXME: Should add some simple logic to ensure that commands are in the
56
- # right order or hadoop will complain. ie. -D options MUST come before
57
- # others
58
- #
59
- def execute_hadoop_workflow
60
- # Input paths join by ','
61
- input_paths = @input_paths.join(',')
62
- #
63
- # Use Settings[:hadoop_home] to set the path your config install.
64
- hadoop_commandline = [
65
- hadoop_runner,
66
- "jar #{options[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
67
- hadoop_jobconf_options,
68
- "-D mapred.job.name='#{job_name}'",
69
- hadoop_other_args,
70
- "-mapper '#{mapper_commandline(:hadoop)}'",
71
- "-reducer '#{reducer_commandline(:hadoop)}'",
72
- "-input '#{input_paths}'",
73
- "-output '#{output_path}'",
74
- "-file '#{this_script_filename}'",
75
- hadoop_recycle_env,
76
- ].flatten.compact.join(" \t\\\n ")
77
- Log.info " Launching hadoop!"
78
- execute_command!(hadoop_commandline)
79
- end
80
-
81
- def hadoop_jobconf_options
82
- jobconf_options = []
83
- # Fixup these options
84
- options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
85
- options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
86
- # If no reducer and no reduce_command, then skip the reduce phase
87
- options[:reduce_tasks] = 0 if (! reducer) && (! options[:reduce_command]) && (! options[:reduce_tasks])
88
- # Fields hadoop should use to distribute records to reducers
89
- unless options[:partition_fields].blank?
90
- jobconf_options += [
91
- jobconf(:partition_fields),
92
- jobconf(:output_field_separator),
93
- ]
94
- end
95
- jobconf_options += [
96
- :io_sort_mb, :io_sort_record_percent,
97
- :map_speculative, :map_tasks,
98
- :max_maps_per_cluster, :max_maps_per_node,
99
- :max_node_map_tasks, :max_node_reduce_tasks,
100
- :max_reduces_per_cluster, :max_reduces_per_node,
101
- :max_record_length, :min_split_size,
102
- :output_field_separator, :key_field_separator,
103
- :partition_fields, :sort_fields,
104
- :reduce_tasks, :respect_exit_status,
105
- :reuse_jvms, :timeout,
106
- :max_tracker_failures, :max_map_attempts,
107
- :max_reduce_attempts
108
- ].map{|opt| jobconf(opt)}
109
- jobconf_options.flatten.compact
110
- end
111
-
112
- def hadoop_other_args
113
- extra_str_args = [ options[:extra_args] ]
114
- if options.split_on_xml_tag
115
- extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{options.split_on_xml_tag}>,end=</#{options.split_on_xml_tag}>'}
116
- end
117
- extra_str_args << ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
118
- extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
119
- extra_str_args
120
- end
121
-
122
- def hadoop_recycle_env
123
- %w[RUBYLIB].map do |var|
124
- %Q{-cmdenv '#{var}=#{ENV[var]}'} if ENV[var]
125
- end.compact
126
- end
127
-
128
- # The path to the hadoop runner script
129
- def hadoop_runner
130
- options[:hadoop_runner] || (options[:hadoop_home]+'/bin/hadoop')
131
- end
132
-
133
- module ClassMethods
134
- #
135
- # Via @pskomoroch via @tlipcon,
136
- #
137
- # "there is a little known Hadoop Streaming trick buried in this Python
138
- # script. You will notice that the date is not actually in the raw log
139
- # data itself, but is part of the filename. It turns out that Hadoop makes
140
- # job parameters you would fetch in Java with something like
141
- # job.get("mapred.input.file") available as environment variables for
142
- # streaming jobs, with periods replaced with underscores:
143
- #
144
- # filepath = os.environ["map_input_file"]
145
- # filename = os.path.split(filepath)[-1]
146
- # Thanks to Todd Lipcon for directing me to that hack.
147
- #
148
-
149
- # HDFS pathname to the input file currently being processed.
150
- def input_file
151
- ENV['map_input_file']
152
- end
153
-
154
- # Directory of the input file
155
- def input_dir
156
- ENV['mapred_input_dir']
157
- end
158
-
159
- # Offset of this chunk within the input file
160
- def map_input_start_offset
161
- ENV['map_input_start']
162
- end
163
-
164
- # length of the mapper's input chunk
165
- def map_input_length
166
- ENV['map_input_length']
167
- end
168
-
169
- def attempt_id
170
- ENV['mapred_task_id']
171
- end
172
- def curr_task_id
173
- ENV['mapred_tip_id']
174
- end
175
-
176
- def script_cmdline_urlenc
177
- ENV['stream_map_streamprocessor']
178
- end
179
- end
180
-
181
- # Standard ClassMethods-on-include trick
182
- def self.included base
183
- base.class_eval do
184
- extend ClassMethods
185
- end
186
- end
187
- end
188
- end
189
-
190
- # -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
191
- # -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
192
- # -D mapred.text.key.comparator.options=-k2,2nr\
193
- # -D mapred.text.key.partitioner.options=-k1,2\
194
- # -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
195
- # -D stream.num.map.output.key.fields=\"$sortfields\"
196
- #
197
- # -D stream.map.output.field.separator=\"'/t'\"
198
- # -D map.output.key.field.separator=. \
199
- # -D mapred.data.field.separator=. \
200
- # -D map.output.key.value.fields.spec=6,5,1-3:0- \
201
- # -D reduce.output.key.value.fields.spec=0-2:5- \
202
-
203
- # "HADOOP_HOME" =>"/usr/lib/hadoop-0.20/bin/..",
204
- # "HADOOP_IDENT_STRING" =>"hadoop",
205
- # "HADOOP_LOGFILE" =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
206
- # "HADOOP_LOG_DIR" =>"/usr/lib/hadoop-0.20/bin/../logs",
207
- # "HOME" =>"/var/run/hadoop-0.20",
208
- # "JAVA_HOME" =>"/usr/lib/jvm/java-6-sun",
209
- # "LD_LIBRARY_PATH" =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
210
- # "PATH" =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
211
- # "USER" =>"hadoop",
212
- #
213
- # "dfs_block_size" =>"134217728",
214
- # "map_input_start" =>"0",
215
- # "map_input_length" =>"125726898",
216
- # "mapred_output_key_class" =>"org.apache.hadoop.io.Text",
217
- # "mapred_output_value_class" =>"org.apache.hadoop.io.Text",
218
- # "mapred_output_format_class" =>"org.apache.hadoop.mapred.TextOutputFormat",
219
- # "mapred_output_compression_codec" =>"org.apache.hadoop.io.compress.DefaultCodec",
220
- # "mapred_output_compression_type" =>"BLOCK",
221
- # "mapred_task_partition" =>"0",
222
- # "mapred_tasktracker_map_tasks_maximum" =>"4",
223
- # "mapred_tasktracker_reduce_tasks_maximum" =>"2",
224
- # "mapred_tip_id" =>"task_200910221152_0023_m_000000",
225
- # "mapred_task_id" =>"attempt_200910221152_0023_m_000000_0",
226
- # "mapred_job_tracker" =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
227
- #
228
- # "mapred_input_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
229
- # "map_input_file" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
230
- # "mapred_working_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
231
- # "mapred_work_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
232
- # "mapred_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
233
- # "mapred_temp_dir" =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
234
- # "PWD" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
235
- # "TMPDIR" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
236
- # "stream_map_streamprocessor" =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
237
- # "user_name" =>"flip",