wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476)
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,121 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- #
4
- # Taken from the [Aruba project](https://github.com/cucumber/aruba). Original license:
5
- #
6
- # Copyright (c) 2010 Aslak Hellesøy, David Chelimsky
7
- #
8
- # Permission is hereby granted, free of charge, to any person obtaining
9
- # a copy of this software and associated documentation files (the
10
- # "Software"), to deal in the Software without restriction, including
11
- # without limitation the rights to use, copy, modify, merge, publish,
12
- # distribute, sublicense, and/or sell copies of the Software, and to
13
- # permit persons to whom the Software is furnished to do so, subject to
14
- # the following conditions:
15
- #
16
- # The above copyright notice and this permission notice shall be
17
- # included in all copies or substantial portions of the Software.
18
- #
19
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
23
- # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24
- # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25
- # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26
-
27
-
28
- require 'childprocess'
29
- require 'tempfile'
30
-
31
- module Wukong
32
- class Runner
33
-
34
- class Process
35
- attr_reader :keep_ansi
36
-
37
- def initialize(cmd, exit_timeout=2.0, io_wait=2.0, keep_ansi=true)
38
- @exit_timeout = exit_timeout
39
- @io_wait = io_wait
40
- @keep_ansi = true
41
-
42
- @process = ChildProcess.build(*cmd)
43
- @process.io.stdout = raw_out_io
44
- @process.io.stderr = raw_err_io
45
- @process.duplex = true
46
- end
47
-
48
- def raw_out_io
49
- @raw_out_io ||= StringIO.new('', 'w')
50
- end
51
-
52
- def raw_err_io
53
- @raw_err_io ||= StringIO.new('', 'w')
54
- end
55
-
56
- def run!(&block)
57
- @process.start
58
- yield self if block_given?
59
- end
60
-
61
- def stdin
62
- wait_for_io do
63
- @process.io.stdin.sync = true
64
- @process.io.stdin
65
- end
66
- end
67
-
68
- def output
69
- stdout + stderr
70
- end
71
-
72
- def stdout
73
- wait_for_io do
74
- @raw_out_io.rewind
75
- filter_ansi(@raw_out_io.read)
76
- end
77
- end
78
-
79
- def stderr
80
- wait_for_io do
81
- @raw_err_io.rewind
82
- filter_ansi(@raw_err_io.read)
83
- end
84
- end
85
-
86
- def stop(reader)
87
- return unless @process
88
- unless @process.exited?
89
- reader.stdout stdout
90
- reader.stderr stderr
91
- @process.poll_for_exit(@exit_timeout)
92
- end
93
- @process.exit_code
94
- end
95
-
96
- def terminate
97
- if @process
98
- flush
99
- @process.stop
100
- flush
101
- end
102
- end
103
-
104
- def flush
105
- stdout && stderr # flush output
106
- end
107
-
108
- private
109
-
110
- def wait_for_io(&block)
111
- sleep @io_wait if @process.alive?
112
- yield
113
- end
114
-
115
- def filter_ansi(string)
116
- keep_ansi ? string : string.gsub(/\e\[\d+(?>(;\d+)*)m/, '')
117
- end
118
-
119
- end
120
- end
121
- end
@@ -1,161 +0,0 @@
1
- require 'wukong'
2
- require 'wukong/script/hadoop_command'
3
- require 'wukong/experimental'
4
-
5
- #
6
- # Runner settings
7
- #
8
-
9
- Settings.define :mode, :type => Symbol, :default => :mapper, :env_var => 'WUKONG_MODE', :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud; 'mapper' or 'reducer' to run that phase.", :wukong => true
10
- Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
11
- Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
12
- Settings.define :script_file, :type => :filename, :description => "script file to execute, or give as first arg", :wukong => true
13
-
14
- module Wukong
15
- # adds ability to execute
16
- extend Wukong::Mixin::FromFile
17
-
18
- def self.from_file(filename)
19
- filename = filename.to_s
20
- filename += ".rb" if filename !~ /\.rb$/
21
- super(filename)
22
- end
23
-
24
- def self.run(filename=nil)
25
- if filename
26
- self.from_file(filename)
27
- else
28
- Settings.resolve!
29
- end
30
- if @main_run then return false ; end
31
- Wukong::Script.new(Settings).run
32
- @main_run = true
33
- end
34
-
35
- #
36
- # sources a script file,
37
- #
38
- class Script
39
- attr_reader :settings # configliere hash of settings
40
- attr_reader :script_file # File to execute
41
- attr_reader :input_paths
42
- attr_reader :output_path
43
-
44
- include Wukong::Script::HadoopCommand
45
-
46
- def initialize(settings)
47
- @settings = settings
48
-
49
- @output_path = settings.rest.pop
50
- @input_paths = settings.rest.reject(&:blank?)
51
- end
52
-
53
-
54
- # Execute the script file in the context of the Wukong module
55
- def run_flow
56
- Log.debug( "Running #{script_file} with settings #{settings}")
57
- script_file = settings.script_file
58
- mode = settings.mode
59
- Wukong.flow(mode).run
60
- end
61
-
62
- #
63
- # In --run mode, use the framework (local, hadoop, emr, etc) to re-launch
64
- # the script as mapper, reducer, etc.
65
- # If --map or --reduce, dispatch to the mapper or reducer.
66
- #
67
- def run
68
- case settings.mode
69
- when :local then execute_local_workflow
70
- when :hadoop, :mapred then execute_hadoop_workflow
71
- else
72
- run_flow
73
- end
74
- end
75
-
76
- #
77
- # Shell command for map phase. By default, calls the script in --map mode
78
- # In hadoop mode, this is given to the hadoop streaming command.
79
- # In local mode, it's given to the system() call
80
- #
81
- def mapper_commandline
82
- "#{ruby_interpreter_path} #{this_script_filename} --mode=mapper " + non_wukong_params
83
- end
84
-
85
- #
86
- # Shell command for reduce phase. By default, calls the script in --reduce mode
87
- # In hadoop mode, this is given to the hadoop streaming command.
88
- # In local mode, it's given to the system() call
89
- #
90
- def reducer_commandline
91
- "#{ruby_interpreter_path} #{this_script_filename} --mode=reducer " + non_wukong_params
92
- end
93
-
94
- def job_name
95
- settings[:job_name] ||
96
- "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
97
- end
98
-
99
- # Wrapper for dangerous operations to catch errors
100
- def safely action, &block
101
- begin
102
- block.call
103
- rescue StandardError => e ; handle_error(action, e); end
104
- end
105
-
106
- protected
107
-
108
- #
109
- # Execute the runner phase:
110
- # use the running framework to relaunch the script in map and in reduce mode
111
- #
112
- def execute_command! *args
113
- command = args.flatten.reject(&:blank?).join(" \\\n ")
114
- Log.info "Running\n\n#{command}\n"
115
- if settings[:dry_run]
116
- Log.info '== [Not running preceding command: dry run] =='
117
- else
118
- maybe_overwrite_output_paths! output_path
119
- $stdout.puts `#{command}`
120
- raise "Streaming command failed!" unless $?.success?
121
- end
122
- end
123
-
124
- #
125
- # In hadoop mode only, removes the destination path before launching
126
- #
127
- # To the panic-stricken: look in .Trash/current/path/to/accidentally_deleted_files
128
- #
129
- def maybe_overwrite_output_paths! output_path
130
- if (settings.rm && (settings.mode == :hadoop))
131
- cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
132
- Log.info "Removing output file #{output_path}: #{cmd}"
133
- puts `#{cmd}`
134
- end
135
- end
136
-
137
- # Reassemble all the non-internal-to-wukong settings into a command line for
138
- # the map/reducer phase scripts
139
- def non_wukong_params
140
- settings.
141
- reject{|param, val| settings.definition_of(param, :wukong) }.
142
- reject{|param, val| param.to_s =~ /catalog_root/ }.
143
- map{|param,val| "--#{param}=#{val}" }.
144
- join(" ")
145
- end
146
-
147
- # the full, real path to the script file
148
- def this_script_filename
149
- Pathname.new($0).realpath
150
- end
151
-
152
- # use the full ruby interpreter path to run slave processes
153
- def ruby_interpreter_path
154
- Pathname.new(File.join(
155
- Config::CONFIG["bindir"],
156
- Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])).realpath
157
- end
158
-
159
-
160
- end
161
- end
@@ -1,240 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- module Wukong
3
- class Script
4
- module HadoopCommand
5
-
6
- # ===========================================================================
7
- #
8
- # Hadoop Options
9
- #
10
- Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :description => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME', :wukong => true
11
- Settings.define :hadoop_runner, :description => "Path to hadoop script. Usually set --hadoop_home instead of this.", :wukong => true
12
-
13
- #
14
- # Translate simplified args to their hairy hadoop equivalents
15
- #
16
- Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
17
- Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
18
- Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
19
- Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
20
- Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
21
- Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
22
- Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
23
- Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
24
- Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
25
- Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
26
- Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
27
- Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
28
- Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
29
- Settings.define :max_tracker_failures, :jobconf => true, :description => 'mapred.max.tracker.failures', :wukong => true
30
- Settings.define :max_map_attempts, :jobconf => true, :description => 'mapred.map.max.attempts', :wukong => true
31
- Settings.define :max_reduce_attempts, :jobconf => true, :description => 'mapred.reduce.max.attempts', :wukong => true
32
- Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
33
- Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
34
- Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
35
- Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
36
- Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
37
- Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
38
- Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
39
- Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
40
- Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
41
- Settings.define :split_on_xml_tag, :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
42
-
43
- # emit a -jobconf hadoop option if the simplified command line arg is present
44
- # if not, the resulting nil will be elided later
45
- def jobconf option
46
- if settings[option]
47
- # "-jobconf %s=%s" % [settings.definition_of(option, :description), settings[option]]
48
- "-D %s=%s" % [settings.definition_of(option, :description), settings[option]]
49
- end
50
- end
51
-
52
- #
53
- # Assemble the hadoop command to execute
54
- # and launch the hadoop runner to execute the script across all tasktrackers
55
- #
56
- # FIXME: Should add some simple logic to ensure that commands are in the
57
- # right order or hadoop will complain. ie. -D settings MUST come before
58
- # others
59
- #
60
- def execute_hadoop_workflow
61
- # Input paths join by ','
62
- input_paths = @input_paths.join(',')
63
- #
64
- # Use Settings[:hadoop_home] to set the path your config install.
65
- hadoop_commandline = [
66
- hadoop_runner,
67
- "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
68
- hadoop_jobconf_settings,
69
- "-D mapred.job.name='#{job_name}'",
70
- hadoop_other_args,
71
- "-mapper '#{mapper_commandline}'",
72
- "-reducer '#{reducer_commandline}'",
73
- "-input '#{input_paths}'",
74
- "-output '#{output_path}'",
75
- "-file '#{this_script_filename}'",
76
- hadoop_recycle_env,
77
- ].flatten.compact.join(" \t\\\n ")
78
- Log.info " Launching hadoop!"
79
- execute_command!(hadoop_commandline)
80
- end
81
-
82
- def hadoop_jobconf_settings
83
- jobconf_settings = []
84
- # Fixup these settings
85
- settings[:reuse_jvms] = '-1' if (settings[:reuse_jvms] == true)
86
- settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
87
- # If no reducer and no reduce_command, then skip the reduce phase
88
- # FIXME: auto-detect nil reducer
89
- # settings[:reduce_tasks] = 0 if (! reducer) && (! settings[:reduce_command]) && (! settings[:reduce_tasks])
90
- # Fields hadoop should use to distribute records to reducers
91
- unless settings[:partition_fields].blank?
92
- jobconf_settings += [
93
- jobconf(:partition_fields),
94
- jobconf(:output_field_separator),
95
- ]
96
- end
97
- jobconf_settings += [
98
- :io_sort_mb, :io_sort_record_percent,
99
- :map_speculative, :map_tasks,
100
- :max_maps_per_cluster, :max_maps_per_node,
101
- :max_node_map_tasks, :max_node_reduce_tasks,
102
- :max_reduces_per_cluster, :max_reduces_per_node,
103
- :max_record_length, :min_split_size,
104
- :output_field_separator, :key_field_separator,
105
- :partition_fields, :sort_fields,
106
- :reduce_tasks, :respect_exit_status,
107
- :reuse_jvms, :timeout,
108
- :max_tracker_failures, :max_map_attempts,
109
- :max_reduce_attempts
110
- ].map{|opt| jobconf(opt)}
111
- jobconf_settings.flatten.compact
112
- end
113
-
114
- def hadoop_other_args
115
- extra_str_args = [ settings[:extra_args] ]
116
- if settings.split_on_xml_tag
117
- extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{settings.split_on_xml_tag}>,end=</#{settings.split_on_xml_tag}>'}
118
- end
119
- extra_str_args << ' -lazyOutput' if settings[:noempty] # don't create reduce file if no records
120
- extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless settings[:partition_fields].blank?
121
- extra_str_args
122
- end
123
-
124
- def hadoop_recycle_env
125
- %w[RUBYLIB].map do |var|
126
- %Q{-cmdenv '#{var}=#{ENV[var]}'} if ENV[var]
127
- end.compact
128
- end
129
-
130
- # The path to the hadoop runner script
131
- def hadoop_runner
132
- settings[:hadoop_runner] || (settings[:hadoop_home]+'/bin/hadoop')
133
- end
134
-
135
- module ClassMethods
136
- #
137
- # Via @pskomoroch via @tlipcon,
138
- #
139
- # "there is a little known Hadoop Streaming trick buried in this Python
140
- # script. You will notice that the date is not actually in the raw log
141
- # data itself, but is part of the filename. It turns out that Hadoop makes
142
- # job parameters you would fetch in Java with something like
143
- # job.get("mapred.input.file") available as environment variables for
144
- # streaming jobs, with periods replaced with underscores:
145
- #
146
- # filepath = os.environ["map_input_file"]
147
- # filename = os.path.split(filepath)[-1]
148
- # Thanks to Todd Lipcon for directing me to that hack.
149
- #
150
-
151
- # HDFS pathname to the input file currently being processed.
152
- def input_file
153
- ENV['map_input_file']
154
- end
155
-
156
- # Directory of the input file
157
- def input_dir
158
- ENV['mapred_input_dir']
159
- end
160
-
161
- # Offset of this chunk within the input file
162
- def map_input_start_offset
163
- ENV['map_input_start']
164
- end
165
-
166
- # length of the mapper's input chunk
167
- def map_input_length
168
- ENV['map_input_length']
169
- end
170
-
171
- def attempt_id
172
- ENV['mapred_task_id']
173
- end
174
- def curr_task_id
175
- ENV['mapred_tip_id']
176
- end
177
-
178
- def script_cmdline_urlenc
179
- ENV['stream_map_streamprocessor']
180
- end
181
- end
182
-
183
- # Standard ClassMethods-on-include trick
184
- def self.included base
185
- base.class_eval do
186
- extend ClassMethods
187
- end
188
- end
189
- end
190
- end
191
- end
192
-
193
- # -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
194
- # -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
195
- # -D mapred.text.key.comparator.options=-k2,2nr\
196
- # -D mapred.text.key.partitioner.options=-k1,2\
197
- # -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
198
- # -D stream.num.map.output.key.fields=\"$sortfields\"
199
- #
200
- # -D stream.map.output.field.separator=\"'/t'\"
201
- # -D map.output.key.field.separator=. \
202
- # -D mapred.data.field.separator=. \
203
- # -D map.output.key.value.fields.spec=6,5,1-3:0- \
204
- # -D reduce.output.key.value.fields.spec=0-2:5- \
205
-
206
- # "HADOOP_HOME" =>"/usr/lib/hadoop-0.20/bin/..",
207
- # "HADOOP_IDENT_STRING" =>"hadoop",
208
- # "HADOOP_LOGFILE" =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
209
- # "HADOOP_LOG_DIR" =>"/usr/lib/hadoop-0.20/bin/../logs",
210
- # "HOME" =>"/var/run/hadoop-0.20",
211
- # "JAVA_HOME" =>"/usr/lib/jvm/java-6-sun",
212
- # "LD_LIBRARY_PATH" =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
213
- # "PATH" =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
214
- # "USER" =>"hadoop",
215
- #
216
- # "dfs_block_size" =>"134217728",
217
- # "map_input_start" =>"0",
218
- # "map_input_length" =>"125726898",
219
- # "mapred_output_key_class" =>"org.apache.hadoop.io.Text",
220
- # "mapred_output_value_class" =>"org.apache.hadoop.io.Text",
221
- # "mapred_output_format_class" =>"org.apache.hadoop.mapred.TextOutputFormat",
222
- # "mapred_output_compression_codec" =>"org.apache.hadoop.io.compress.DefaultCodec",
223
- # "mapred_output_compression_type" =>"BLOCK",
224
- # "mapred_task_partition" =>"0",
225
- # "mapred_tasktracker_map_tasks_maximum" =>"4",
226
- # "mapred_tasktracker_reduce_tasks_maximum" =>"2",
227
- # "mapred_tip_id" =>"task_200910221152_0023_m_000000",
228
- # "mapred_task_id" =>"attempt_200910221152_0023_m_000000_0",
229
- # "mapred_job_tracker" =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
230
- #
231
- # "mapred_input_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
232
- # "map_input_file" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
233
- # "mapred_working_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
234
- # "mapred_work_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
235
- # "mapred_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
236
- # "mapred_temp_dir" =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
237
- # "PWD" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
238
- # "TMPDIR" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
239
- # "stream_map_streamprocessor" =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
240
- # "user_name" =>"flip",