wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,129 +0,0 @@
1
- require 'htmlentities'
2
- require 'addressable/uri'
3
-
4
- # Fix a bug (?) in the HTMLEntities encoder class with $KCODE='NONE'
5
- HTMLEntities::Encoder.class_eval do
6
- private
7
- def extended_entity_regexp
8
- @extended_entity_regexp ||= (
9
- if encoding_aware?
10
- regexp = '[^\u{20}-\u{7E}]'
11
- else
12
- # regexp = '[^\x20-\x7E]'
13
- regexp = '[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+'
14
- end
15
- regexp += "|'" if @flavor == 'html4'
16
- Regexp.new(regexp)
17
- )
18
- end
19
- end
20
-
21
- module Wukong
22
- #
23
- # By default (or explicitly with the :xml strategy), convert string to
24
- # * XML-encoded ASCII,
25
- #
26
- # * with a guarantee that the characters " quote, ' apos \\ backslash,
27
- # carriage-return \r newline \n and tab \t (as well as all other control
28
- # characters) are encoded.
29
- #
30
- # * Any XML-encoding in the original text is encoded with no introspection:
31
- # encode_str("<a href=\"foo\">")
32
- # # => "&lt;a href=&quot;foo&quot;&gt;"
33
- #
34
- # * Useful: http://rishida.net/scripts/uniview/conversion.php
35
- #
36
- # With the :url strategy,
37
- # * URL-encode the string
38
- # * This is as strict as possible: encodes all but alphanumeric and _ underscore.
39
- # The resulting string is thus XML- and URL-safe.
40
- # http://addressable.rubyforge.org/api/classes/Addressable/URI.html#M000010
41
- #
42
- # Wukong.decode_str(Wukong.encode_str(str)) returns the original str
43
- #
44
- # If you're seeing bad_encoding errors, try
45
- # $KCODE='u' unless "1.9".respond_to?(:encoding)
46
- # at the start of your script.
47
- #
48
- def self.encode_str str, strategy=:xml
49
- begin
50
- case strategy
51
- when :xml then self.html_encoder.encode(str, :basic, :named, :decimal).gsub(/\\/, '&#x5C;')
52
- when :url then Addressable::URI.encode_component(str, /[^\w]/)
53
- else raise "Don't know how to encode with strategy #{strategy}"
54
- end
55
- rescue ArgumentError => e
56
- '!bad_encoding!! ' + str.gsub(/[^\w\s\.\-@#%]+/, '')
57
- end
58
- end
59
- # HTMLEntities encoder instance
60
- def self.html_encoder
61
- @html_encoder ||= HTMLEntities.new
62
- end
63
-
64
- #
65
- # Decode string from its encode_str representation. This can include
66
- # dangerous things such as tabs, newlines, backslashes and cryptofascist
67
- # propaganda.
68
- #
69
- def self.decode_str str, strategy=:xml
70
- case strategy
71
- when :xml then self.html_encoder.decode(str)
72
- when :url then Addressable::URI.unencode_component(str)
73
- else raise "Don't know how to decode with strategy #{strategy}"
74
- end
75
- end
76
-
77
- #
78
- # Replace each given field in the hash with its
79
- # encoded value
80
- #
81
- def self.encode_components hsh, *fields
82
- fields.each do |field|
83
- hsh[field] = hsh[field].to_s.wukong_encode if hsh[field]
84
- end
85
- end
86
- end
87
-
88
- String.class_eval do
89
-
90
- #
91
- # Strip control characters that might harsh our buzz, TSV-wise
92
- # See Wukong.encode_str
93
- #
94
- def wukong_encode! *args
95
- replace self.wukong_encode(*args)
96
- end
97
-
98
- def wukong_encode(*args)
99
- Wukong.encode_str(self, *args)
100
- end
101
-
102
- #
103
- # Decode string into original (and possibly unsafe) form
104
- # See Wukong.encode_str and Wukong.decode_str
105
- #
106
- def wukong_decode!(*args)
107
- replace self.wukong_decode(*args)
108
- end
109
-
110
- def wukong_decode(*args)
111
- Wukong.decode_str(self, *args)
112
- end
113
-
114
- #
115
- # Takes an XML-encoded or plaintext string and forces it into canonical encoding
116
- #
117
- def wukong_recode!(*args)
118
- replace self.wukong_decode(*args).wukong_encode(*args)
119
- end
120
- def wukong_recode
121
- Wukong.encode_str(Wukong.decode_str(self, *args), *args)
122
- end
123
- end
124
-
125
- Struct.class_eval do
126
- def recode!(*args)
127
- each_pair{|k,v| v.wukong_recode!(*args) if (v && v.respond_to?(:wukong_recode!)) }
128
- end
129
- end
@@ -1,11 +0,0 @@
1
- #
2
- # This file must be explicitly required
3
- #
4
-
5
- require 'wukong'
6
-
7
- #
8
-
9
- Kernel.at_exit do
10
- puts "Not implemented yet"
11
- end
@@ -1,5 +0,0 @@
1
- def Wu(&block)
2
- Wukong.class_eval{ def self.load_examples_helper() require File.expand_path("../../examples/examples_helper", File.dirname(__FILE__)) ; end }
3
- Wukong.instance_eval(&block)
4
- Wukong.run
5
- end
@@ -1,52 +0,0 @@
1
- #
2
- # Borrowed from Opscode Chef -- thanks guys
3
- #
4
- # Author:: Adam Jacob (<adam@opscode.com>)
5
- # Author:: Christopher Walters (<cw@opscode.com>)
6
- # Copyright:: Copyright (c) 2008 Opscode, Inc.
7
- # License:: Apache License, Version 2.0
8
- #
9
- # Licensed under the Apache License, Version 2.0 (the "License");
10
- # you may not use this file except in compliance with the License.
11
- # You may obtain a copy of the License at
12
- #
13
- # http://www.apache.org/licenses/LICENSE-2.0
14
- #
15
- # Unless required by applicable law or agreed to in writing, software
16
- # distributed under the License is distributed on an "AS IS" BASIS,
17
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
- # See the License for the specific language governing permissions and
19
- # limitations under the License.
20
- #
21
-
22
- module Wukong
23
- module Mixin
24
- module FromFile
25
-
26
- # Loads a given ruby file, and runs instance_eval against it in the context of the current
27
- # object.
28
- #
29
- # Raises an IOError if the file cannot be found, or is not readable.
30
- def from_file(filename)
31
- if File.exists?(filename) && File.readable?(filename)
32
- self.instance_eval(IO.read(filename), filename, 1)
33
- else
34
- raise IOError, "Cannot open or read #{filename}!"
35
- end
36
- end
37
-
38
- # Loads a given ruby file, and runs class_eval against it in the context of the current
39
- # object.
40
- #
41
- # Raises an IOError if the file cannot be found, or is not readable.
42
- def class_from_file(filename)
43
- if File.exists?(filename) && File.readable?(filename)
44
- self.class_eval(IO.read(filename), filename, 1)
45
- else
46
- raise IOError, "Cannot open or read #{filename}!"
47
- end
48
- end
49
-
50
- end
51
- end
52
- end
@@ -1,56 +0,0 @@
1
- module Wukong
2
- #
3
- #
4
- #
5
- #
6
- class Job < Wukong::Graph
7
- # invokable resources
8
- attr_reader :resources
9
-
10
- def to_s
11
- ['<job', handle,
12
- "resources={#{resources.join(' | ')}}",
13
- "chain={#{chain.join(' | ')}}",
14
- ].join(' ')+'>'
15
- end
16
-
17
-
18
- def add_resource(type, handle=nil, *args, &block)
19
- rsrc = Wukong.create(type, handle, *args, &block)
20
- rsrc.graph = self
21
- @resources << rsrc
22
- rsrc
23
- end
24
-
25
- end
26
-
27
- module Task
28
- extend Gorillib::Concern
29
- include Wukong::Stage
30
-
31
- module ClassMethods
32
- def define_action(name, options={}, &block)
33
- self.actions = self.actions.merge(name => options.merge(:block => block))
34
- end
35
-
36
- def class_defaults
37
- super
38
- # field :actions, Array, :of => Symbol, :description => 'list of actions this stage responds to'
39
- class_attribute :actions
40
- self.actions ||= Hash.new
41
- class_attribute :default_action
42
-
43
- define_action :nothing, :description => 'ze goggles, zey do nussing'
44
- end
45
-
46
- end
47
- included do
48
- self.class_defaults
49
- end
50
- end
51
-
52
- def self.job(handle, *args, &block)
53
- @jobs ||= Hash.new
54
- @jobs[handle] ||= Job.new(handle, *args, &block)
55
- end
56
- end
@@ -1,17 +0,0 @@
1
- module Wukong
2
-
3
- #
4
- # Let `Wukong::Job`s invoke and depend on `Rake::Task`s.
5
- #
6
- # for example, Rails defines the `:environment` task:
7
- #
8
- # task :email_expiring, :depends => :environment do
9
- # desc "Email expiring accounts to let them know"
10
- # date = ENV['from'] ? Date.parse(ENV['from']) : Date.today
11
- # Account.notify_expiring(date)
12
- # end
13
- #
14
- #
15
- module RakeCompat
16
- end
17
- end
@@ -1,79 +0,0 @@
1
- module Wukong
2
- @@registries ||= Hash.new
3
-
4
- def self.registry(type, options={})
5
- type = type.to_sym
6
- plural = options[:plural] || "#{type}s"
7
- return if @@registries[type]
8
- @@registries[type] = Registry.new(type)
9
- self.singleton_class.class_eval do
10
- # def self.sources() @sources ; end
11
- define_method(plural){ @@registries[type] }
12
- # def self.source(handle) sources.find(handle) ; end
13
- define_method("#{type}_klass"){|handle| @@registries[type].find(handle) }
14
- # def self.register_source(klass) sources.register(klass) ; end
15
- define_method("register_#{type}"){ |klass| @@registries[type].register(klass) }
16
- # def self.unregister_source(handle) sources.register(klass) ; end
17
- define_method("unregister_#{type}"){ |handle| @@registries[type].unregister(handle) }
18
- # def self.source_exists?(handle) sources.exists?(handle) ; end
19
- define_method("#{type}_exists?"){ |handle| @@registries[type].exists?(handle) }
20
- end
21
- end
22
-
23
- def self.create(type, *args, &block)
24
- @@registries[type].create(*args, &block)
25
- end
26
-
27
- class Registry < Mash
28
- attr_reader :type
29
-
30
- def initialize(type)
31
- @type = type
32
- end
33
-
34
- def all
35
- self.dup.freeze
36
- end
37
-
38
- # given example of registry's class, return it;
39
- # otherwise, look up the handle and return that.
40
- def find(handle)
41
- return handle if handle.is_a?(Class)
42
- self[handle]
43
- end
44
-
45
- def find!(handle, *args)
46
- find(handle, *args) or raise ArgumentError, "cannot find #{type} named '#{handle}'"
47
- end
48
-
49
- def exists?(handle)
50
- self.has_key?(handle)
51
- end
52
-
53
- def create(handle, *args, &block)
54
- find!(handle).new(*args, &block)
55
- end
56
-
57
- # add given class to registry
58
- def register(klass)
59
- self[klass.handle] = klass
60
- end
61
-
62
- def unregister(klass)
63
- self.delete(klass.handle)
64
- end
65
-
66
- def convert_key(key)
67
- key.is_a?(Class) ? key.handle : super(key)
68
- end
69
-
70
- # A valid identifier starts with a letter and has only letters, numbers and underscores
71
- VALID_IDENTIFIER_RE = /\A[a-z]\w+\z/i
72
-
73
- def self.valid_handle?(handle)
74
- handle.to_s =~ VALID_IDENTIFIER_RE
75
- end
76
-
77
- end
78
-
79
- end
@@ -1,276 +0,0 @@
1
- module Wukong
2
- class RunnerResult
3
- field :runner, Runner, :doc => 'Runner object that created this job'
4
- field :command, Array, :of => String, :doc => 'launch command'
5
- field :beg_time, Time
6
- field :end_time, Time
7
- field :raw_out, String
8
- field :raw_err, String
9
- end
10
-
11
- #
12
- # A uniform interface for launching processes.
13
- #
14
- # * accepts humanized and standardized args
15
- # * synthesize args into a command
16
- # * launch the process
17
- # * parse its output
18
- #
19
- class Runner
20
- class_attribute :result_parser ; self.result_parser = RunnerResult
21
-
22
- field :name, Symbol, :required => true
23
- field :executor_path, Pathname, :required => true
24
-
25
- def to_long_params(arg_hsh, dash='-')
26
- arg_hsh.inject([]) do |acc, (param, val)|
27
- param = param.to_s.gsub(/[\-_\W]+/, dash)
28
- acc << "--#{param.to_s}" << val.to_s
29
- end
30
- end
31
-
32
- def native_args(arg_hsh)
33
- to_dashed_params(arg_hsh)
34
- end
35
-
36
- def command(arg_hsh)
37
- [executor_path, *native_args(arg_hsh)]
38
- end
39
-
40
- def run(input, arg_hsh)
41
- cmd = command(input, arg_hsh)
42
- beg_time = Time.now
43
- out, err = launch( *cmd )
44
- end_time = Time.now
45
-
46
- result_parser.new({
47
- :runner => self,
48
- :command => cmd,
49
- :beg_time => beg_time,
50
- :end_time => end_time,
51
- :input => input,
52
- :arg_hsh => arg_hsh,
53
- :raw_out => out,
54
- :raw_err => err,
55
- })
56
- end
57
-
58
- class << self
59
- def executor(*args)
60
- ArgumentError.check_arity!(args, 1)
61
- @executor = args.first if args.present?
62
- @executor
63
- end
64
-
65
- def launch(*cmd)
66
- out = `#{cmd.join(' ')}`
67
- end
68
-
69
- def which(basename)
70
- raise ArgumentError, "which wants a basename, not a path (#{basename})" if basename =~ %r{\/}
71
- out, err = launch('which', basename)
72
- out.chomp
73
- end
74
-
75
- end
76
- end
77
-
78
- module RunnerWithInputOutput
79
- extend Gorillib::Concern
80
- include Hanuman::IsOwnInputSlot
81
- include Hanuman::IsOwnOutputSlot
82
-
83
- # sugar for a command that takes input to produce output.
84
- #
85
- # @param [Array<String>, String] inputs -- added as the `:inputs` arg (converting to an array if necessary)
86
- # @param [String] output -- added as the `:output` arg
87
- #
88
- def run(inputs, output, args={})
89
- inputs = Array.wrap(inputs)
90
- super args.merge(:inputs => inputs, :output => output)
91
- end
92
- end
93
-
94
- #
95
- # Wukong::Runner interface for the `cp` command
96
- #
97
- # @example
98
- # runner = Wukong::CpRunner.new
99
- # runner.run('my_src.jpg', 'my_dest.jpg')
100
- #
101
- class CpRunner
102
- include RunnerWithInputOutput
103
- executor which('cp')
104
-
105
- argument :verbose, Boolean, :native => '-v', :solo => true, :doc => 'show files as they are copied'
106
- argument :duplicate, Boolean, :native => '-a', :solo => true, :doc => 'Preserves structure and attributes of files'
107
- end
108
-
109
- class ScpRunner
110
- include RunnerWithInputOutput
111
- executor which('scp')
112
-
113
- argument :verbose, Boolean, :native => '-v', :solo => true, :doc => 'show files as they are copied'
114
- argument :duplicate, Boolean, :native => '-p', :solo => true, :doc => 'Preserves structure and attributes of files'
115
- #
116
- argument :ssh_user, String
117
- argument :dest_host, String
118
- argument :ssh_key_file, Pathname, :native => '-i'
119
- argument :dest_port, Integer, :native => '-P'
120
-
121
- argument :compression, Boolean, :native => '-C'
122
- argument :recursive, Boolean, :native => '-r'
123
-
124
- self.success_exit_status = 0
125
- end
126
-
127
- module RunnerForJava
128
-
129
- argument :java_home, :env_var => 'JAVA_HOME', :doc => 'path to the java environment; $JAVA_HOME/bin usually holds your java runner'
130
-
131
- argument :java_prog, :finally => ->(){ path_to(arg_val(:java_home), 'bin', 'java') }
132
-
133
- argument :jar
134
-
135
- argument :classpath
136
-
137
- def java_conf
138
- end
139
-
140
- end
141
-
142
- class HadoopRunner
143
- include RunnerWithInputOutput
144
- executor which('hadoop')
145
-
146
- argument :verbose, Boolean, :native => '-v', :solo => true, :doc => 'show files as they are copied'
147
-
148
- argument :hadoop_home, :default => '/usr/lib/hadoop', :doc => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME'
149
- argument :hadoop_runner, :doc => "Path to hadoop script. Usually set --hadoop_home instead of this."
150
-
151
- #
152
- # Translate simplified args to their hairy hadoop equivalents
153
- #
154
- argument :job_name, :jobconf => 'mapred.job.name'
155
- #
156
- argument :io_sort_mb, :jobconf => 'io.sort.mb'
157
- argument :io_sort_record_percent, :jobconf => 'io.sort.record.percent'
158
- argument :key_field_separator, :jobconf => 'map.output.key.field.separator'
159
- argument :map_speculative, :jobconf => 'mapred.map.tasks.speculative.execution'
160
- argument :map_tasks, :jobconf => 'mapred.map.tasks'
161
- argument :max_maps_per_cluster, :jobconf => 'mapred.max.maps.per.cluster'
162
- argument :max_maps_per_node, :jobconf => 'mapred.max.maps.per.node'
163
- argument :max_node_map_tasks, :jobconf => 'mapred.tasktracker.map.tasks.maximum'
164
- argument :max_node_reduce_tasks, :jobconf => 'mapred.tasktracker.reduce.tasks.maximum'
165
- argument :max_record_length, :jobconf => 'mapred.linerecordreader.maxlength', :doc => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
166
- argument :max_reduces_per_cluster,:jobconf => 'mapred.max.reduces.per.cluster'
167
- argument :max_reduces_per_node, :jobconf => 'mapred.max.reduces.per.node'
168
- argument :max_tracker_failures, :jobconf => 'mapred.max.tracker.failures'
169
- argument :max_map_attempts, :jobconf => 'mapred.map.max.attempts'
170
- argument :max_reduce_attempts, :jobconf => 'mapred.reduce.max.attempts'
171
- argument :min_split_size, :jobconf => 'mapred.min.split.size'
172
- argument :output_field_separator, :jobconf => 'stream.map.output.field.separator'
173
- argument :partition_fields, :jobconf => 'num.key.fields.for.partition'
174
- argument :reduce_tasks, :jobconf => 'mapred.reduce.tasks'
175
- argument :respect_exit_status, :jobconf => 'stream.non.zero.exit.is.failure'
176
- argument :reuse_jvms, :jobconf => 'mapred.job.reuse.jvm.num.tasks'
177
- argument :sort_fields, :jobconf => 'stream.num.map.output.key.fields'
178
- argument :timeout, :jobconf => 'mapred.task.timeout'
179
- argument :noempty, :doc => "don't create zero-byte reduce files (hadoop mode only)"
180
- argument :split_on_xml_tag, :doc => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
181
-
182
-
183
- argument :mapper_command, String, :native => '-mapper'
184
- argument :reducer_command, String, :native => '-reducer'
185
-
186
- repeated_argument :file, String, :native => '-file'
187
-
188
- # emit a -jobconf hadoop option if the simplified command line arg is present
189
- def jobconf option
190
- if settings[option]
191
- # "-jobconf %s=%s" % [settings.definition_of(option, :description), settings[option]]
192
- "-D %s=%s" % [settings.definition_of(option, :description), settings[option]]
193
- end
194
- end
195
-
196
- def finalize_settings
197
- settings[:reuse_jvms] = '-1' if (settings[:reuse_jvms] == true)
198
- settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
199
- settings[:reduce_tasks] = 0 if (! settings[:reduce_command])
200
- end
201
-
202
- def hadoop_other_args
203
- extra_str_args = [ settings[:extra_args] ]
204
- if settings.split_on_xml_tag
205
- extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{settings.split_on_xml_tag}>,end=</#{settings.split_on_xml_tag}>'}
206
- end
207
- extra_str_args << ' -lazyOutput' if settings[:noempty] # don't create reduce file if no records
208
- extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless settings[:partition_fields].blank?
209
- extra_str_args
210
- end
211
-
212
- def hadoop_recycle_env
213
- %w[RUBYLIB].map do |var|
214
- %Q{-cmdenv '#{var}=#{ENV[var]}'} if ENV[var]
215
- end.compact
216
- end
217
-
218
- # The path to the hadoop runner script
219
- def hadoop_runner
220
- settings[:hadoop_runner] || (settings[:hadoop_home]+'/bin/hadoop')
221
- end
222
-
223
- #
224
- # Assemble the hadoop command to execute
225
- # and launch the hadoop runner to execute the script across all tasktrackers
226
- #
227
- # FIXME: Should add some simple logic to ensure that commands are in the
228
- # right order or hadoop will complain. ie. -D settings MUST come before
229
- # others
230
- #
231
- def execute_hadoop_workflow
232
- # Input paths join by ','
233
- input_paths = @input_paths.join(',')
234
- #
235
- # Use Settings[:hadoop_home] to set the path your config install.
236
- hadoop_commandline = [
237
- hadoop_runner,
238
- "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
239
- hadoop_jobconf_settings,
240
- "-D mapred.job.name='#{job_name}'",
241
- hadoop_other_args,
242
- "-mapper '#{mapper_commandline}'",
243
- "-reducer '#{reducer_commandline}'",
244
- "-input '#{input_paths}'",
245
- "-output '#{output_path}'",
246
- "-file '#{this_script_filename}'",
247
- hadoop_recycle_env,
248
- ].flatten.compact.join(" \t\\\n ")
249
- Log.info " Launching hadoop!"
250
- execute_command!(hadoop_commandline)
251
- end
252
-
253
- end
254
-
255
-
256
- #
257
- # Req
258
- #
259
- class HadoopJob
260
- field :job_id
261
- field :k
262
-
263
- def from_jobtracker(jobtracker_host)
264
- contents = fetch_jobtracker_raw(jobtracker_host)
265
- attrs = parse_jobtracker_raw(contents)
266
- end
267
-
268
- def fetch_jobtracker_raw(jobtracker_host)
269
- end
270
-
271
- def parse_jobtracker_raw(contents)
272
- end
273
- end
274
-
275
-
276
- end