wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,35 +0,0 @@
1
- require File.expand_path('stopwords', File.dirname(__FILE__))
2
- module Wukong
3
- module Helper
4
-
5
- module Tokenize
6
- #
7
- # Split a string into its constituent words.
8
- #
9
- # This is pretty simpleminded:
10
- # * downcase the word
11
- # * Split at any non-alphanumeric boundary, including '_'
12
- # * However, preserve the special cases of 's, 'd or 't at the end of a
13
- # word.
14
- #
15
- # tokenize("Ability is a poor man's wealth #johnwoodenquote")
16
- # # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
17
- #
18
- def self.tokenize str
19
- return [] if str.blank?
20
- str = str.downcase;
21
- # kill off all punctuation except [stuff]'s or [stuff]'t
22
- # this includes hyphens (words are split)
23
- str = str.
24
- gsub(/[^a-zA-Z0-9\']+/, ' ').
25
- gsub(/(\w)\'([stdm]|re|ve|ll)\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
26
- # Busticate at whitespace
27
- words = str.split(/\s+/)
28
- words.reject!{|w| w.length < 3 || Wukong::Corpus::STOPWORDS_3.include?(w) }
29
- words
30
- end
31
-
32
- end
33
-
34
- end
35
- end
@@ -1,38 +0,0 @@
1
- module Wukong
2
- # Common logger
3
- #
4
- # Set your own at any time with
5
- # Wukong.logger = YourAwesomeLogger.new(...)
6
- # If you have log4r installed you can use
7
- # Wukong.logger = Wukong.default_log4r_logger
8
- #
9
- # If Wukong.logger is too much typing for you,
10
- # use the Log constant
11
- #
12
- # Default format:
13
- # I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
14
- #
15
- def self.logger
16
- return @logger if defined?(@logger)
17
- require 'logger'
18
- @logger = Logger.new STDERR
19
- @logger.instance_eval do
20
- def dump *args
21
- debug args.inspect
22
- end
23
- end
24
- @logger
25
- end
26
-
27
- def self.logger= logger
28
- @logger = logger
29
- end
30
- end
31
-
32
- #
33
- # A convenient logger.
34
- #
35
- # define Log yourself to prevent its creation
36
- #
37
- Log = Wukong.logger unless defined?(Log)
38
-
@@ -1,72 +0,0 @@
1
- Settings.define :log_interval, :default => 10_000, :type => Integer, :description => 'How many iterations between log statements'
2
- Settings.define :log_seconds, :default => 30, :type => Integer, :description => 'How many seconds between log statements'
3
-
4
- #
5
- # Periodic monitor
6
- #
7
- #
8
- # This is very much a work in progress
9
- #
10
- class PeriodicMonitor
11
- attr_reader :iter, :start_time, :options
12
- attr_accessor :interval
13
- attr_accessor :time_interval
14
-
15
- def initialize extra_options={}
16
- @options = {}
17
- @options.deep_merge!( extra_options || {} )
18
- @iter = 0
19
- @start_time = now
20
- @last_report = @start_time
21
- @interval = (options[:log_interval] || Settings[:log_interval]).to_i
22
- @interval = 1000 unless @interval >= 1
23
- @time_interval = (options[:log_seconds] || Settings[:log_seconds]).to_i
24
- end
25
-
26
- def periodically *args, &block
27
- incr!
28
- if ready?
29
- @last_report = Time.now
30
- if block
31
- emit block.call(self, *args)
32
- else
33
- emit progress(*args)
34
- end
35
- end
36
- end
37
-
38
- def emit log_line
39
- Log.info log_line
40
- end
41
-
42
- def incr!
43
- @iter += 1
44
- end
45
-
46
- def ready?
47
- (iter % @interval == 0) || (since > time_interval)
48
- end
49
-
50
- def progress *stuff
51
- [
52
- "%15d" % iter,
53
- "%7.1f"% elapsed_time, "sec",
54
- "%7.1f"% rate, "/sec",
55
- now.to_flat,
56
- *stuff
57
- ].flatten.join("\t")
58
- end
59
-
60
- def elapsed_time
61
- now - start_time
62
- end
63
- def since
64
- now - @last_report
65
- end
66
- def now
67
- Time.now.utc
68
- end
69
- def rate
70
- iter.to_f / elapsed_time
71
- end
72
- end
@@ -1,269 +0,0 @@
1
- require 'extlib/inflection'
2
- require 'wukong'
3
-
4
-
5
- #
6
- # Basic types: SQL conversion
7
- #
8
- class << Integer ; def to_sql() 'INT' end ; end
9
- class << Bignum ; def to_sql() 'BIGINT' end ; end
10
- class << String ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
11
- class << Symbol ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
12
- class << BigDecimal ; def to_sql() 'DECIMAL' end ; end if defined?(BigDecimal)
13
- class << EpochTime ; def to_sql() 'INT' end ; end if defined?(EpochTime)
14
- class << FilePath ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath)
15
- class << Flag ; def to_sql() 'CHAR(1) CHARACTER SET ASCII' end ; end if defined?(Flag)
16
- class << IPAddress ; def to_sql() 'CHAR(15) CHARACTER SET ASCII' end ; end if defined?(IPAddress)
17
- class << URI ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI)
18
- class << Csv ; def to_sql() 'TEXT' end ; end if defined?(Csv)
19
- class << Yaml ; def to_sql() 'TEXT' end ; end if defined?(Yaml)
20
- class << Json ; def to_sql() 'TEXT' end ; end if defined?(Json)
21
- class << Regex ; def to_sql() 'TEXT' end ; end if defined?(Regex)
22
- class String ; def to_sql() self ; end ; end
23
- class Symbol ; def to_sql() self.to_s.upcase ; end ; end
24
-
25
- #
26
- # Basic types: Pig conversion
27
- #
28
- class << Integer ; def to_pig() 'int' end ; end
29
- class << Bignum ; def to_pig() 'long' end ; end
30
- class << Float ; def to_pig() 'float' end ; end
31
- class << Symbol ; def to_pig() 'chararray' end ; end
32
- class << Date ; def to_pig() 'long' end ; end
33
- class << Time ; def to_pig() 'long' end ; end
34
- class << DateTime ; def to_pig() 'long' end ; end
35
- class << String ; def to_pig() 'chararray' end ; end
36
- class << Text ; def to_pig() 'chararray' end ; end if defined?(Text)
37
- class << Blob ; def to_pig() 'bytearray' end ; end if defined?(Blob)
38
- class << Boolean ; def to_pig() 'bytearray' end ; end if defined?(Boolean)
39
- class String ; def to_pig() self.to_s ; end ; end
40
- class Symbol ; def to_pig() self.to_s ; end ; end
41
-
42
- class << BigDecimal ; def to_pig() 'long' end ; end if defined?(BigDecimal)
43
- class << EpochTime ; def to_pig() 'integer' end ; end if defined?(EpochTime)
44
- class << FilePath ; def to_pig() 'chararray' end ; end if defined?(FilePath)
45
- class << Flag ; def to_pig() 'chararray' end ; end if defined?(Flag)
46
- class << IPAddress ; def to_pig() 'chararray' end ; end if defined?(IPAddress)
47
- class << URI ; def to_pig() 'chararray' end ; end if defined?(URI)
48
- class << Csv ; def to_pig() 'chararray' end ; end if defined?(Csv)
49
- class << Yaml ; def to_pig() 'chararray' end ; end if defined?(Yaml)
50
- class << Json ; def to_pig() 'chararray' end ; end if defined?(Json)
51
- class << Regex ; def to_pig() 'chararray' end ; end if defined?(Regex)
52
-
53
-
54
- #
55
- # Basic types: Avro conversion
56
- #
57
- class << Integer ; def to_avro() 'int' end ; end
58
- class << Bignum ; def to_avro() 'long' end ; end
59
- class << Float ; def to_avro() 'float' end ; end
60
- class << Symbol ; def to_avro() 'string' end ; end
61
- class << Date ; def to_avro() 'long' end ; end
62
- class << Time ; def to_avro() 'long' end ; end
63
- class << DateTime ; def to_avro() 'long' end ; end
64
- class << String ; def to_avro() 'string' end ; end
65
- class << Text ; def to_avro() 'string' end ; end if defined?(Text)
66
- class << Blob ; def to_avro() 'bytearray' end ; end if defined?(Blob)
67
- class << Boolean ; def to_avro() 'bytearray' end ; end if defined?(Boolean)
68
- class String ; def to_avro() self.to_s ; end ; end
69
- class Symbol ; def to_avro() self.to_s ; end ; end
70
-
71
- class << BigDecimal ; def to_avro() 'long' end ; end if defined?(BigDecimal)
72
- class << EpochTime ; def to_avro() 'integer' end ; end if defined?(EpochTime)
73
- class << FilePath ; def to_avro() 'string' end ; end if defined?(FilePath)
74
- class << Flag ; def to_avro() 'string' end ; end if defined?(Flag)
75
- class << IPAddress ; def to_avro() 'string' end ; end if defined?(IPAddress)
76
- class << URI ; def to_avro() 'string' end ; end if defined?(URI)
77
- class << Csv ; def to_avro() 'string' end ; end if defined?(Csv)
78
- class << Yaml ; def to_avro() 'string' end ; end if defined?(Yaml)
79
- class << Json ; def to_avro() 'string' end ; end if defined?(Json)
80
- class << Regex ; def to_avro() 'string' end ; end if defined?(Regex)
81
-
82
- module Wukong
83
- #
84
- # Export model's structure for loading and manipulating in other frameworks,
85
- # such as SQL and Pig
86
- #
87
- # Your class should support the #resource_name and #mtypes methods
88
- # An easy way to do this is by being a TypedStruct.
89
- #
90
- # You can use this to do silly stunts like
91
- #
92
- # % ruby -rubygems -r'wukong/schema' -e 'require "/path/to/user_model.rb" ; puts User.pig_load ; '
93
- #
94
- # If you include the classes from Wukong::Datatypes::MoreTypes, you can draw
95
- # on a richer set of type definitions
96
- #
97
- # require 'wukong/datatypes/more_types'
98
- # include Wukong::Datatypes::MoreTypes
99
- # require 'wukong/schema'
100
- #
101
- # (if you're using Wukong to bulk-process Datamapper records, these should
102
- # fall right in line as well -- make sure *not* to include
103
- # Wukong::Datatypes::MoreTypes, and to require 'dm-more' before 'wukong/schema')
104
- #
105
- module Schema
106
- module ClassMethods
107
-
108
- #
109
- # Table name for this class
110
- #
111
- def table_name
112
- resource_name.to_s.pluralize
113
- end
114
-
115
- # ===========================================================================
116
- #
117
- # Pig
118
- #
119
-
120
- # Export schema as Pig
121
- #
122
- # Won't correctly handle complex types (struct having struct as member, eg)
123
- #
124
- def to_pig
125
- members.zip(mtypes).map do |member, type|
126
- member.to_s + ': ' + type.to_pig
127
- end.join(', ')
128
- end
129
-
130
- #
131
- # A pig snippet to load a tsv file containing
132
- # serialized instances of this class.
133
- #
134
- # Assumes the first column is the resource name (you can, and probably
135
- # should, follow with an immediate GENERATE to ditch that field.)
136
- #
137
- def pig_load filename=nil
138
- filename ||= resource_name.to_s+'.tsv'
139
- cmd = [
140
- "%-23s" % self.to_s.gsub(/^.*\W/, ""),
141
- "= LOAD '#{filename}'",
142
- "AS ( rsrc:chararray,", self.to_pig, ') ;',
143
- ].join(" ")
144
- end
145
-
146
- # ===========================================================================
147
- #
148
- # SQL
149
-
150
- #
151
- # Schema definition for use in a CREATE TABLE statement
152
- #
153
- def to_sql
154
- sql_str = []
155
- members.zip(mtypes).each do |attr, type|
156
- type_str = type.respond_to?(:to_sql) ? type.to_sql : type.to_s.upcase
157
- sql_str << " %-29s\t%s" %["`#{attr}`", type_str]
158
- end
159
- sql_str.join(",\n")
160
- end
161
-
162
- #
163
- # List off member names, to be stuffed into a SELECT or a LOAD DATA
164
- #
165
- def sql_members
166
- members.map{|attr| "`#{attr}`" }.join(", ")
167
- end
168
-
169
- #
170
- # Creates a table for the wukong class.
171
- #
172
- # * primary_key gives the name of one column to be set as the primary key
173
- #
174
- # * if drop_first is given, a "DROP TABLE IF EXISTS" statement will
175
- # precede the snippet.
176
- #
177
- # * table_options sets the table parameters. Useful table_options for a
178
- # read-only database in MySQL:
179
- # ENGINE=MyISAM PACK_KEYS=0
180
- #
181
- def sql_create_table primary_key=nil, drop_first=nil, table_options=''
182
- str = []
183
- str << %Q{DROP TABLE IF EXISTS `#{self.table_name}`; } if drop_first
184
- str << %Q{CREATE TABLE `#{self.table_name}` ( }
185
- str << self.to_sql
186
- if primary_key then str.last << ',' ; str << %Q{ PRIMARY KEY \t(`#{primary_key}`)} ; end
187
- str << %Q{ ) #{table_options} ;}
188
- str.join("\n")
189
- end
190
-
191
- #
192
- # A mysql snippet to bulk load the tab-separated-values file emitted by a
193
- # Wukong script.
194
- #
195
- # Let's say your class is ClickLog; its resource_name is "click_log"
196
- # and thus its table_name is 'click_logs'. sql_load_mysql will:
197
- #
198
- # * disable indexing on the table
199
- # * import the file, replacing any existing rows. (Replacement is governed
200
- # by primary key and unique index constraints -- see the mysql docs).
201
- # * re-enable indexing on that table
202
- # * show the number of
203
- #
204
- # The load portion will
205
- #
206
- # * Load into a table named click_logs
207
- # * from a file named click_logs.tsv
208
- # * where all rows have the string 'click_logs' in their first column
209
- # * and all remaining fields in their #members order
210
- # * assuming strings are wukong_encode'd and so shouldn't be escaped or enclosed.
211
- #
212
- # Why the "LINES STARTING BY" part? For map/reduce outputs that have many
213
- # different objects jumbled together, you can just dump in the whole file,
214
- # landing each object in its correct table.
215
- #
216
- def sql_load_mysql(filename=nil)
217
- filename ||= ":resource_name.tsv"
218
- filename.gsub!(/:resource_name/, self.table_name)
219
- str = []
220
- # disable indexing during bulk load
221
- str << %Q{ALTER TABLE `#{self.table_name}` DISABLE KEYS; }
222
- # Bulk load the tab-separated-values file.
223
- str << %Q{LOAD DATA LOCAL INFILE '#{filename}'}
224
- str << %Q{ REPLACE INTO TABLE `#{self.table_name}` }
225
- str << %Q{ COLUMNS }
226
- str << %Q{ TERMINATED BY '\\t' }
227
- str << %Q{ OPTIONALLY ENCLOSED BY '' }
228
- str << %Q{ ESCAPED BY '' }
229
- str << %Q{ LINES STARTING BY '#{self.resource_name}' }
230
- str << %Q{ ( @dummy,\n }
231
- str << ' '+self.sql_members
232
- str << %Q{\n ); }
233
- # Re-enable indexing
234
- str << %Q{ALTER TABLE `#{self.table_name}` ENABLE KEYS ; }
235
- # Show it loaded correctly
236
- str << %Q{SELECT NOW(), COUNT(*), '#{self.table_name}' FROM `#{self.table_name}`; }
237
- str.join("\n")
238
- end
239
-
240
-
241
-
242
-
243
- #
244
- # Avro
245
- #
246
- def to_avro
247
- require 'json' # yikes
248
- h = {}
249
- h[:name] = self.name
250
- h[:type] = "record"
251
- h[:fields] = []
252
- members.zip(mtypes).each do |member, type|
253
- h[:fields] << {:name => member.to_s, :type => type.to_avro}
254
- end
255
- h.to_json
256
- end
257
-
258
- end
259
- # standard stanza for making methods appear on the class itself on include
260
- def self.included base
261
- base.class_eval{ extend ClassMethods }
262
- end
263
- end
264
- end
265
-
266
- #
267
- # TypedStructs are class-schematizeable
268
- #
269
- Struct.class_eval do include(Wukong::Schema) ; end
@@ -1,286 +0,0 @@
1
- require 'pathname'
2
- require 'wukong/extensions'
3
- require 'configliere' ; Settings.use(:commandline, :env_var, :define)
4
- require 'wukong'
5
- require 'wukong/script/hadoop_command'
6
- require 'wukong/script/local_command'
7
- require 'rbconfig' # for uncovering ruby_interpreter_path
8
- require 'wukong/streamer' ; include Wukong::Streamer
9
- module Wukong
10
- # == How to run a Wukong script
11
- #
12
- # your/script.rb --run path/to/input_files path/to/output_dir
13
- #
14
- # All of the file paths are HDFS paths ; your script path, of course, is on the local filesystem.
15
- #
16
- # == Command-line options
17
- #
18
- # If you'd like to listen for any command-line options, specify them at the
19
- # command line:
20
- #
21
- # your/script.rb --my_bool_opt --my_val_taking_opt=val \
22
- # --run path/to/input_files path/to/output_dir
23
- #
24
- # In this case the options hash for both Mapper and Reducer will contain
25
- #
26
- # :my_bool_opt => true,
27
- # :my_val_taking_opt => 'val'
28
- #
29
- # == Complicated input paths
30
- #
31
- # To use more than one file as input, you can use normal * ? [] wildcards or
32
- # give a comma-separated list -- see the hadoop documentation for syntax.
33
- #
34
- # == Run in Elastic MapReduce Mode (--run=emr)
35
- #
36
- # Wukong can be used to start scripts on the amazon cloud
37
- #
38
- # * copies the script to s3 in two parts
39
- # * invokes it using the amazon API
40
- #
41
- # == Run locally (--run=local)
42
- #
43
- # To run your script locally, use --run=local
44
- #
45
- # your/script.rb --run=local path/to/input_files path/to/output_dir
46
- #
47
- # This will pipe the contents of path/to/input_files through first your
48
- # mapper, then sort, then the reducer, storing the results in the given output
49
- # directory.
50
- #
51
- # All paths refer to the /local/ filesystem -- hadoop is never involved and in
52
- # fact doesn't even have to be installed.
53
- #
54
- # == How to test your scripts
55
- #
56
- # You can supply the --map argument in place of --run to run the mapper on its
57
- # own (and similarly, --reduce to run the reducer standalone):
58
- #
59
- # cat ./local/test/input.tsv | ./examples/word_count.rb --map | more
60
- #
61
- # or, if your test data lies on the HDFS,
62
- #
63
- # hdp-cat test/input.tsv | ./examples/word_count.rb --map | more
64
- #
65
- #
66
- class Script
67
- include Wukong::HadoopCommand
68
- include Wukong::LocalCommand
69
- attr_reader :mapper, :reducer, :options
70
- attr_reader :input_paths, :output_path
71
-
72
- # ---------------------------------------------------------------------------
73
- #
74
- # Default options for Wukong
75
- # http://github.com/infochimps/wukong
76
- #
77
- # If you set an environment variable WUKONG_CONFIG, *or* if the file
78
- # $HOME/.wukong.rb exists, that file will be +require+'d as well.
79
- #
80
- # Important values to set:
81
- #
82
- # * hadoop_home -- Path to root of hadoop install. If your hadoop runner is
83
- # /usr/local/share/hadoop/bin/hadoop
84
- # then your hadoop_home is
85
- # /usr/local/share/hadoop.
86
- # You can also set a :hadoop_runner that gives the full path to the hadoop script
87
- #
88
- # * default_run_mode -- Whether to run using hadoop (and
89
- # thus, requiring a working hadoop install), or to run in local mode
90
- # (script --map | sort | script --reduce)
91
- #
92
- Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
93
- Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
94
- Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
95
- Settings.define :run, :env_var => 'WUKONG_RUN_MODE', :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud; 'map' or 'reduce' to run that phase.", :wukong => true
96
- Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
97
- Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
98
- Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
99
- Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
100
-
101
- #
102
- # Instantiate the Script with the Mapper and the Reducer class (each a
103
- # Wukong::Streamer) it should call back.
104
- #
105
- #
106
- # == Identity or External program as map or reduce
107
- #
108
- # To use the identity reducer ('cat'), instantiate your Script class with
109
- # +nil+ as the reducer class. (And similarly to use an identity mapper,
110
- # supply +nil+ for the mapper class.)
111
- #
112
- # To use an external program as your reducer (mapper), subclass the
113
- # reduce_command (map_command) method to return the full command line
114
- # expression to call.
115
- #
116
- # class MyMapper < Wukong::Streamer::Base
117
- # # ... awesome stuff ...
118
- # end
119
- #
120
- # class MyScript < Wukong::Script
121
- # # prefix each unique line with the count of its occurrences.
122
- # def reduce_command
123
- # '/usr/bin/uniq -c'
124
- # end
125
- # end
126
- # MyScript.new(MyMapper, nil).run
127
- #
128
- def initialize mapper, reducer=nil, extra_options={}
129
- Settings.resolve!
130
- @options = Settings
131
- options.merge! extra_options
132
- @mapper = (case mapper when Class then mapper.new when nil then nil else mapper ; end)
133
- @reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
134
- @output_path = options.rest.pop
135
- @input_paths = options.rest.reject(&:blank?)
136
- if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
137
- raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}"
138
- end
139
- end
140
-
141
- #
142
- # In --run mode, use the framework (local, hadoop, emr, etc) to re-launch
143
- # the script as mapper, reducer, etc.
144
- # If --map or --reduce, dispatch to the mapper or reducer.
145
- #
146
- def run
147
- case run_mode
148
- when 'map' then mapper.stream
149
- when 'reduce' then reducer.stream
150
- when 'local' then execute_local_workflow
151
- when 'cassandra' then execute_hadoop_workflow
152
- when 'hadoop', 'mapred' then execute_hadoop_workflow
153
- when 'emr'
154
- require 'wukong/script/emr_command'
155
- execute_emr_workflow
156
- else dump_help
157
- end
158
- end
159
-
160
- # if only --run is given, assume default run mode
161
- def run_mode
162
- case
163
- when options[:map] then 'map'
164
- when options[:reduce] then 'reduce'
165
- when ($0 =~ /-mapper\.rb$/) then 'map'
166
- when ($0 =~ /-reducer\.rb$/) then 'reduce'
167
- when (options[:run] == true) then options[:default_run_mode]
168
- else options[:run].to_s
169
- end
170
- end
171
-
172
- #
173
- # Shell command for map phase. By default, calls the script in --map mode
174
- # In hadoop mode, this is given to the hadoop streaming command.
175
- # In local mode, it's given to the system() call
176
- #
177
- def mapper_commandline(run_option=:local)
178
- if mapper
179
- case run_option
180
- when :local then
181
- "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
182
- when :hadoop then
183
- "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
184
- end
185
- else
186
- options[:map_command]
187
- end
188
- end
189
-
190
- #
191
- # Shell command for reduce phase. By default, calls the script in --reduce mode
192
- # In hadoop mode, this is given to the hadoop streaming command.
193
- # In local mode, it's given to the system() call
194
- #
195
- def reducer_commandline(run_option=:local)
196
- if reducer
197
- case run_option
198
- when :local then
199
- "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
200
- when :hadoop then
201
- "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
202
- end
203
- else
204
- options[:reduce_command]
205
- end
206
- end
207
-
208
- def job_name
209
- options[:job_name] ||
210
- "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
211
- end
212
-
213
- # Wrapper for dangerous operations to catch errors
214
- def safely action, &block
215
- begin
216
- block.call
217
- rescue StandardError => e ; handle_error(action, e); end
218
- end
219
-
220
- protected
221
-
222
- #
223
- # Execute the runner phase:
224
- # use the running framework to relaunch the script in map and in reduce mode
225
- #
226
- def execute_command! *args
227
- command = args.flatten.reject(&:blank?).join(" \\\n ")
228
- Log.info "Running\n\n#{command}\n"
229
- if options[:dry_run]
230
- Log.info '== [Not running preceding command: dry run] =='
231
- else
232
- maybe_overwrite_output_paths! output_path
233
- $stdout.puts `#{command}`
234
- raise "Streaming command failed!" unless $?.success?
235
- end
236
- end
237
-
238
- #
239
- # In hadoop mode only, removes the destination path before launching
240
- #
241
- # To the panic-stricken: look in .Trash/current/path/to/accidentally_deleted_files
242
- #
243
- def maybe_overwrite_output_paths! output_path
244
- if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
245
- cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
246
- Log.info "Removing output file #{output_path}: #{cmd}"
247
- puts `#{cmd}`
248
- end
249
- end
250
-
251
- # Reassemble all the non-internal-to-wukong options into a command line for
252
- # the map/reducer phase scripts
253
- def non_wukong_params
254
- options.
255
- reject{|param, val| options.definition_of(param, :wukong) }.
256
- map{|param,val| "--#{param}=#{val}" }.
257
- join(" ")
258
- end
259
-
260
- # the full, real path to the script file
261
- def this_script_filename
262
- Pathname.new($0).realpath
263
- end
264
-
265
- # use the full ruby interpreter path to run slave processes
266
- def ruby_interpreter_path
267
- Pathname.new(File.join(
268
- Config::CONFIG["bindir"],
269
- Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])).realpath
270
- end
271
-
272
- #
273
- # Usage
274
- #
275
- def dump_help
276
- options.dump_help %Q{Please specify a run mode: you probably want to start with
277
- #{$0} --run --local input.tsv output.tsv
278
- although
279
- cat input.tsv | #{$0} --map > mapped.tsv
280
- or
281
- cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv
282
- can be useful for initial testing.}
283
- end
284
-
285
- end
286
- end