wukong 3.0.0.pre → 3.0.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,92 +0,0 @@
1
- ---
2
- layout: default
3
- title: Install Notes
4
- collapse: false
5
- ---
6
- h1(gemheader). {{ site.gemname }} %(small):: install%
7
-
8
- ** "Get the code":#getcode
9
- ** "Setup":#setup
10
- ** "Installing and Running Wukong with Hadoop":#gethadoop
11
- ** "Installing and Running Wukong with Datamapper, ActiveRecord, the command-line and more":#others
12
-
13
-
14
- <notextile><div class="toggle"></notextile>
15
-
16
- h2(#getcode). Get the code
17
-
18
- We're still actively developing {{ site.gemname }}. The newest version is available via "Git":http://git-scm.com on "github:":http://github.com/mrflip/{{ site.gemname }}
19
-
20
- pre. $ git clone git://github.com/mrflip/{{ site.gemname }}
21
-
22
- A gem is available from "gemcutter:":http://gemcutter.org/gems/{{ site.gemname }}
23
-
24
- pre. $ sudo gem install {{ site.gemname }} --source=http://gemcutter.org
25
-
26
- (don't use the gems.github.com version -- it's way out of date.)
27
-
28
- You can instead download this project in either "zip":http://github.com/mrflip/{{ site.gemname }}/zipball/master or "tar":http://github.com/mrflip/{{ site.gemname }}/tarball/master formats.
29
-
30
- h3. Get the Dependencies
31
-
32
- * Hadoop
33
- * Pig (optional)
34
- * Parts of {{ site.gemname }} require these gems:
35
- ** addressable/uri
36
- ** htmlentities
37
- ** extlib
38
- ** YAML
39
- ** JSON
40
-
41
- <notextile></div><div class="toggle"></notextile>
42
-
43
- h2(#setup). Setup
44
-
45
- 1. Allow Wukong to discover where his elephant friend lives by setting a $HADOOP_HOME environment variable: @export HADOOP_HOME="/usr/local/share/hadoop"@
46
- 2. Add wukong's @bin/@ directory to your $PATH if you'd like to use the "wutils":wutils.html
47
-
48
- <i>(see also: "Ruby Hadoop Quickstart":http://blog.pdatasolutions.com/post/191978092/ruby-on-hadoop-quickstart)</i>
49
-
50
- <notextile></div><div class="toggle"></notextile>
51
-
52
- h2(#gethadoop). Installing and Running Wukong with Hadoop
53
-
54
- Wukong was primarily developed for Hadoop, and we think it's the best way to use Hadoop (it's certainly the most fun!).
55
-
56
- h3. Run Wukong on the Amazon AWS EC2 Cloud
57
-
58
- h3. Hadoop Infrastructure
59
-
60
- Even if you have a bunch of machines with spare cycles, lots of RAM, and a shared filesystem... do yourself a favor and start out using the "Cloudera AMIs on Amazon's EC2 cloud.":http://www.cloudera.com/hadoop-ec2 There are an overwhelming number of fiddly little parameters and you'll be glad for the user experience before you get into server setup. If it's still mid-late 2009 when you read this, ignore prudence and jump straight to using Hadoop 0.20. It will be a) more fun, b) much more robust (trust me, at "v0.20" you want to live on the bleeding edge), and c) you won't have to suffer through migrating your HDFS two weeks after setup.
61
-
62
- To set up Hadoop, your best bet is the Cloudera AMIs on Amazon's EC2 compute cloud:
63
-
64
- * http://www.cloudera.com/hadoop-ec2
65
- * http://www.cloudera.com/hadoop-ec2-ebs-beta
66
-
67
- EC2 means anyone with a $10 bill can rent a 10-machine cluster with 1TB of distributed storage for 8 hours.
68
-
69
- h3. Run Wukong using Amazon AWS Elastic MapReduce
70
-
71
- AWS Elastic MapReduce saves the trouble of even setting up a cluster: click, bam, there it is.
72
-
73
- Phil Ripperger has prepared a "Ruby Hadoop Quickstart":http://blog.pdatasolutions.com/post/191978092/ruby-on-hadoop-quickstart explaining how to get started with Wukong, Hadoop and the Amazon Elastic MapReduce cloud -- it's better than anything we could put here. Thanks Phil!
74
-
75
- h3. Set up a Hadoop cluster
76
-
77
- If you have a local cluster, or just want to experiment with a single-machine install, check out the Cloudera packages for both Debian/Ubuntu-based and Redhat/RPM-based Linux systems.
78
-
79
- h3. More Hadoop Notes
80
-
81
- I've braindumped some random notes on configuring and using hadoop "over here":hadoop-tips.html
82
-
83
- <notextile></div><div class="toggle"></notextile>
84
-
85
- h2(#others). Wukong isn't just Hadoop: Datamapper, ActiveRecord, command-line usage and more
86
-
87
- Wukong is used by many in a non-Hadoop environment -- anywhere you can stream data records, you can unleash its monkey power.
88
-
89
- Please see the "usage notes":usage.html#playnice for more!
90
-
91
-
92
- <notextile></div></notextile>
@@ -1,107 +0,0 @@
1
- ---
2
- layout: default
3
- title: Apache License
4
- ---
5
-
6
-
7
- h1(gemheader). {{ site.gemname }} %(small):: license%
8
-
9
-
10
- The wukong code is __Copyright (c) 2009 Philip (flip) Kromer__
11
-
12
- Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
13
-
14
- http://www.apache.org/licenses/LICENSE-2.0
15
-
16
- Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an **AS IS** BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
17
-
18
- h1. Apache License
19
-
20
- Apache License
21
- Version 2.0, January 2004
22
- http://www.apache.org/licenses/
23
-
24
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
25
-
26
- <notextile><div class="toggle"></notextile>
27
-
28
- h2. 1. Definitions.
29
-
30
- * **License** shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
31
- * **Licensor** shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
32
- * **Legal Entity** shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, **control** means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
33
- * **You** (or **Your**) shall mean an individual or Legal Entity exercising permissions granted by this License.
34
- * **Source** form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
35
- * **Object** form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
36
- * **Work** shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
37
- * **Derivative Works** shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
38
- * **Contribution** shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, **submitted** means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
39
- * **Contributor** shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
40
-
41
- <notextile></div><div class="toggle"></notextile>
42
-
43
- h2. 2. Grant of Copyright License.
44
-
45
- Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
46
-
47
-
48
- <notextile></div><div class="toggle"></notextile>
49
-
50
- h2. 3. Grant of Patent License.
51
-
52
- Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
53
-
54
- <notextile></div><div class="toggle"></notextile>
55
-
56
- h2. 4. Redistribution.
57
-
58
- You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
59
-
60
- # You must give any other recipients of the Work or Derivative Works a copy of this License; and
61
- # You must cause any modified files to carry prominent notices stating that You changed the files; and
62
- # You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
63
- # If the Work includes a __NOTICE__ text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
64
-
65
- You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
66
-
67
- <notextile></div><div class="toggle"></notextile>
68
-
69
- h2. 5. Submission of Contributions.
70
-
71
- Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
72
-
73
- <notextile></div><div class="toggle"></notextile>
74
-
75
- h2. 6. Trademarks.
76
-
77
- This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
78
-
79
- <notextile></div><div class="toggle"></notextile>
80
-
81
- h2. 7. Disclaimer of Warranty.
82
-
83
- Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an **AS IS** BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
84
-
85
- <notextile></div><div class="toggle"></notextile>
86
-
87
- h2. 8. Limitation of Liability.
88
-
89
- In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
90
-
91
- <notextile></div><div class="toggle"></notextile>
92
-
93
- h2. 9. Accepting Warranty or Additional Liability.
94
-
95
- While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
96
-
97
- END OF TERMS AND CONDITIONS
98
-
99
- <notextile></div><div class="toggle"></notextile>
100
-
101
- Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
102
-
103
- http://www.apache.org/licenses/LICENSE-2.0
104
-
105
- Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an **AS IS** BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
106
-
107
- <notextile></div></notextile>
@@ -1,377 +0,0 @@
1
- h2. Questions
2
-
3
- * can I access an EC2 resource (eg cassandra cluster)
4
-
5
-
6
- h2. Setup
7
-
8
- * download from http://developer.amazonwebservices.com/connect/entry.jspa?externalID=2264&categoryID=273
9
- * wget http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
10
- * unzip elastic-mapreduce-ruby.zip
11
- * cd elastic-mapreduce-ruby
12
- * ln -nfs ~/.wukong/credentials.json
13
- * put your keypair in ~/.wukong/keypairs/WHATEVER.pem
14
-
15
- {
16
- "access-id": "<insert your aws access id here>",
17
- "private-key": "<insert your aws secret access key here>",
18
- "key-pair": "WHATEVER",
19
- "key-pair-file": "~/.wukong/keypairs/WHATEVER.pem",
20
- "log-uri": "s3n://yourmom/emr/logs"
21
- }
22
-
23
- h4. Paths
24
-
25
-
26
- Paths:
27
- LogUri s3 s3n://yourmom/emr/logs
28
- step log files s3 {log_uri}/Steps/{step}/{syslog,stdout,controller,stderr}
29
- Script s3 s3://yourmom/emr/scripts/path/to/script
30
- Wukong s3 s3://s3scripts.infochimps.org/wukong/current/....
31
- Input s3 s3n://yourmom/data/wordcount/input
32
- Output s3 s3n://yourmom/data/wordcount/output
33
- Bootstrap Scripts s3 s3://elasticmapreduce/bootstrap-actions/{configure-hadoop,configure-daemons,run-if}
34
-
35
-
36
- Credentials desk elastic-mapreduce-ruby/credentials.json
37
-
38
- hadoop.tmp.dir inst /mnt/var/lib/hadoop/tmp
39
- local hdfs inst /mnt/var/lib/hadoop/dfs
40
- your home dir inst /home/hadoop (small space)
41
- Job Settings inst /mnt/var/lib/info/job-flow.json
42
- Instance Settings inst /mnt/var/lib/info/instance.json
43
-
44
-
45
- h4. Launching emr tasks in wukong
46
-
47
- * Uses configliere to get your credentials, log_uri, emr_root, script_path
48
- * Uploads script phases.
49
- s3://emr_root/scripts/:script_path/script_name-datetime-mapper.rb
50
- s3://emr_root/scripts/:script_path/script_name-datetime-reducer.rb
51
- ** You can use the following symbols to assemble the path:
52
- :emr_root, :script_name, :script_path, :username, :date, :datetime, :phase, :rand, :pid, :hostname, :keypair
53
- The values for :emr_root and :script_path are taken from configliere.
54
- if :script_path is missing, scripts/:username is used.
55
- The same timestamp and random number will be used for each phase
56
-
57
- * uses elastic-mapreduce-ruby to launch the job
58
-
59
- ** specify --emr.{option}
60
- ** eg --emr.alive, --emr.num-instances
61
-
62
- reads ~/.wukong/emr.yaml
63
-
64
- common
65
- jobs / jobname
66
-
67
- name same as for hadoop name
68
- alive
69
-
70
- num_instances .
71
- instance_type .
72
- master_instance_type .
73
- availability_zone us-east-1b
74
- key_pair job_handle
75
- key_pair_file ~/.wukong/keypairs/{key_pair}.pem
76
-
77
- hadoop_version 0.20
78
- plain_output Return the job flow id from create step as simple text
79
- info JSON hash
80
- emr_root
81
- log_uri emr_root/logs/:script_path/:script_name-:datetime
82
-
83
- --hadoop-version=0.20 --stream --enable_debugging --verbose --debug --alive
84
- --availability-zone AZ --key_pair KP --key_pair_file KPF --access_id EC2ID --private_key EC2PK
85
- --slave_instance_type m2.xlarge --master_instance_type m2.xlarge --num_instances NUM
86
- #
87
- --step_name
88
- --step_action CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
89
- --jobflow JOBFLOWID
90
- #
91
- --info Settings.emr.info.to_json
92
- #
93
- --input INPUT
94
- --output OUTPUT
95
- --mapper s3://emr_root/jobs/:script_path/script_name-datetime-mapper.rb (or class)
96
- --reducer s3://emr_root/jobs/:script_path/script_name-datetime-reducer.rb (or class)
97
- --cache s3n://emr_root/jobs/:script_path/cache/sample.py#sample.py
98
- --cache-archive s3://s3scripts.infochimps.org/wukong/current/wukong.zip
99
- --cache-archive s3n://emr_root/jobs/:script_path/cache/sample.jar
100
- --jobconf whatever
101
-
102
- ...
103
-
104
- also:
105
-
106
- --ssh
107
- --scp SRC --to DEST
108
- --terminate
109
- --logs
110
- --list
111
- --all
112
-
113
- h4. Aggregate
114
-
115
- http://hadoop.apache.org/common/docs/r0.20.1/api/org/apache/hadoop/mapred/lib/aggregate/package-summary.html
116
-
117
- DoubleValueSum sums up a sequence of double values.
118
- LongValueMax maintain the maximum of a sequence of long values.
119
- LongValueMin maintain the minimum of a sequence of long values.
120
- LongValueSum sums up a sequence of long values.
121
- StringValueMax maintain the biggest of a sequence of strings.
122
- StringValueMin maintain the smallest of a sequence of strings.
123
- UniqValueCount dedupes a sequence of objects.
124
- ValueHistogram computes the histogram of a sequence of strings.
125
-
126
- h2. Commands
127
-
128
- # create a job and run a mapper written in python and stored in Amazon S3
129
- elastic-mapreduce --create --enable_debugging \
130
- --stream
131
- --mapper s3://elasticmapreduce/samples/wordcount/wordSplitter.py \
132
- --input s3n://elasticmapreduce/samples/wordcount/input \
133
- --output s3n://mybucket/output_path
134
- --log_uri
135
-
136
- elastic-mapreduce --list # list recently created job flows
137
- elastic-mapreduce --list --active # list all running or starting job flows
138
- elastic-mapreduce --list --all # list all job flows
139
-
140
- h4. Bootstrap actions
141
-
142
- --bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-hadoop
143
- --args "--site-config-file,s3://bucket/config.xml,-s,mapred.tasktracker.map.tasks.maximum=2"
144
-
145
- --bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-daemons
146
- --args "--namenode-heap-size=2048,--namenode-opts=\"-XX:GCTimeRatio=19\""
147
-
148
- You should recompile cascading applications with the Hadoop 0.20 version specified so they can take advantage of the new features available in this version.
149
- Hadoop 0.20 fully supports Pig scripts.
150
- All Amazon Elastic MapReduce sample apps are compatible with Hadoop 0.20. The AWS Management Console supports only Hadoop 0.20, so samples will default to 0.20 once launched.
151
-
152
- For Hadoop version 0.20, Hive version 0.5 and Pig version 0.6 are used. The version can be selected by setting HadoopVersion in JobFlowInstancesConfig.
153
-
154
- h3. Pig
155
-
156
- REGISTER s3:///my-bucket/piggybank.jar
157
-
158
- Additional functions:
159
-
160
- http://developer.amazonwebservices.com/connect/entry.jspa?externalID=2730
161
-
162
-
163
- h2. Hadoop and Cluster setup
164
-
165
- h3. Data Compression
166
-
167
- Output Compression: -jobconf mapred.output.compress=true FileOutputFormat.setCompressOutput(conf, true);
168
- Intermediate Compression: -jobconf mapred.compress.map.output=true conf.setCompressMapOutput(true);
169
-
170
- You can also use a bootstrap action to automatically compress all job outputs. Here is how to do that with the Ruby client.
171
-
172
- --bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-hadoop --args "-s,mapred.output.compress=true"
173
-
174
- Compressed Input data Hadoop automatically detects the .gz extension on file names and extracts the contents. You do not need to take any action to extract gzipped files.
175
-
176
-
177
- ===========================================================================
178
-
179
-
180
- $LOAD_PATH << File.dirname(__FILE__)
181
- require 'amazon/coral/elasticmapreduceclient'
182
- require 'amazon/retry_delegator'
183
-
184
- config = {
185
- :endpoint => "https://elasticmapreduce.amazonaws.com",
186
- :ca_file => File.join(File.dirname(__FILE__), "cacert.pem"),
187
- :aws_access_key => my_access_id,
188
- :aws_secret_key => my_secret_key,
189
- :signature_algorithm => :V2
190
- }
191
- client = Amazon::Coral::ElasticMapReduceClient.new_aws_query(config)
192
-
193
- is_retryable_error_response = Proc.new do |response|
194
- if response == nil then
195
- false
196
- else
197
- ret = false
198
- if response['Error'] then
199
- ret ||= ['InternalFailure', 'Throttling', 'ServiceUnavailable', 'Timeout'].include?(response['Error']['Code'])
200
- end
201
- ret
202
- end
203
- end
204
-
205
- client = Amazon::RetryDelegator.new(client, :retry_if => is_retryable_error_response)
206
-
207
- puts client.DescribeJobFlows.inspect
208
- puts client.DescribeJobFlows('JobFlowId' => 'j-ABAYAS1019012').inspect
209
-
210
- h3. Example job-flow.json and instance.json
211
-
212
- job-flow.json {"jobFlowId":"j-1UVPY9PQ3XAXE","jobFlowCreationInstant":1271711181000,
213
- "instanceCount":4,"masterInstanceId":"i-f987ee92","masterPrivateDnsName":
214
- "localhost","masterInstanceType":"m1.small","slaveInstanceType":
215
- "m1.small","hadoopVersion":"0.18"}
216
-
217
- instance.json {"isMaster":true,"isRunningNameNode":true,"isRunningDataNode":true,
218
- "isRunningJobTracker":false,"isRunningTaskTracker":false}
219
-
220
- h3. Configuration
221
-
222
- h4. Configure Hadoop
223
-
224
- Location: s3://elasticmapreduce/bootstrap-actions/configure-hadoop
225
-
226
- -<f>, --<file>-key-value
227
- Key/value pair that will be merged into the specified config file.
228
-
229
- -<F>, --<file>-config-file
230
- Config file in Amazon S3 or locally that will be merged with the specified config file.
231
-
232
- Acceptable config files:
233
- s/S site hadoop-site.xml
234
- d/D default hadoop-default.xml
235
- c/C core core-site.xml
236
- h/H hdfs hdfs-site.xml
237
- m/M mapred mapred-site.xml
238
-
239
-
240
- Example Usage:
241
-
242
- elastic-mapreduce --create \
243
- --bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-hadoop
244
- --args "--site-config-file,s3://bucket/config.xml,-s,mapred.tasktracker.map.tasks.maximum=2"
245
-
246
-
247
- Specify no reducers:
248
- --mapred-key-value mapred.reduce.tasks=0
249
-
250
-
251
- -cacheFile -files Comma separated URIs
252
- -cacheArchive -archives Comma separated URIs
253
- -jobconf -D key=value
254
-
255
- h4. Run If
256
-
257
- Location: s3://elasticmapreduce/bootstrap-actions/run-if <JSON path>[!]=<value> <command> [args...]
258
-
259
- JSON path A path in the instance config or job flow config for the key we should look up.
260
- Value The value we expect to find.
261
- Command The command to run if the value is what we expect (or not what we expect in the case of !=). This can be a path in S3 or a local command.
262
- Args Arguments to pass to the command as it runs.
263
-
264
- elastic-mapreduce --create --alive \
265
- --bootstrap-action s3://elasticmapreduce/bootstrap-actions/run-if
266
- --args "instance.isMaster=true,echo,Running,on,master,node"
267
-
268
-
269
- h4. Configure Daemons
270
-
271
- --<daemon>-heap-size Set the heap size in megabytes for the specified daemon.
272
- --<daemon>-opts Set additional Java options for the specified daemon.
273
- --replace Replace the existing hadoop-user-env.sh file if it exists.
274
-
275
- <daemon> is one of: namenode, datanode, jobtracker, tasktracker, client
276
-
277
- elastic-mapreduce --create --alive
278
- --bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-daemons
279
- --args "--namenode-heap-size=2048,--namenode-opts=\"-XX:GCTimeRatio=19\""
280
-
281
-
282
- h2. Command Line
283
-
284
-
285
- Creating Job Flows
286
- --create Create a new job flow
287
- --name NAME Name of the job flow
288
- --alive Create a job flow that stays running even though it has executed all its steps
289
- --num-instances NUM Number of instances in the job flow
290
- --instance-type TYPE The type of the instances to launch
291
- --slave-instance-type TYPE The type of the slave instances to launch
292
- --master-instance-type TYPE The type of the master instance to launch
293
- --key-pair KEY_PAIR The name of your Amazon EC2 Keypair
294
- --key-pair-file FILE_PATH Path to your local pem file for your EC2 key pair
295
- --log-uri LOG_URI Location in S3 to store logs from the job flow, e.g. s3n://mybucket/logs
296
- --availability-zone A_Z Specify the Availability Zone in which to launch the jobflow
297
- --info INFO Specify additional info in JSON
298
- --hadoop-version INFO Specify the Hadoop Version to install
299
- --plain-output Return the job flow id from create step as simple text
300
-
301
- Adding Jar Steps to Job Flows
302
- --jar JAR Add a step that executes a jar
303
- --wait-for-step Wait for the step to finish
304
- --main-class MAIN_CLASS Specify main class for the JAR
305
-
306
- Adding Streaming Steps to Job Flows
307
- --stream Add a step that performs hadoop streaming
308
- --input INPUT Input to the steps, e.g. s3n://mybucket/input
309
- --output OUTPUT The output to the steps, e.g. s3n://mybucket/output
310
- --mapper MAPPER The mapper program or class
311
- --cache CACHE_FILE A file to load into the cache, e.g. s3n://mybucket/sample.py#sample.py
312
- --cache-archive CACHE_FILE A file to unpack into the cache, e.g. s3n://mybucket/sample.jar
313
- --jobconf KEY=VALUE Specify jobconf arguments to pass to streaming, e.g. mapred.task.timeout=800000
314
- --reducer REDUCER The reducer program or class
315
-
316
- Job Flow Debugging Options
317
- --enable-debugging Enable job flow debugging (you must be signed up to SimpleDB for this to work)
318
-
319
- Adding Pig steps to job flows
320
- --pig-script Add a step that runs a Pig script
321
- --pig-interactive Add a step that sets up the job flow for an interactive (via SSH) pig session
322
-
323
- Configuring a Hive on a JobFlow
324
- --hive-site HIVE_SITE Override Hive configuration with configuration from HIVE_SITE
325
- --hive-script Add a step that runs a Hive script
326
- --hive-interactive Add a step that sets up the job flow for an interactive (via SSH) hive session
327
-
328
- Adding Steps from a Json File to Job Flows
329
- --json FILE Add a sequence of steps stored in a json file
330
- --param VARIABLE=VALUE substitute <variable> with value in the json file
331
-
332
- Contacting the Master Node
333
- --no-wait Don't wait for the Master node to start before executing scp or ssh
334
- --ssh [COMMAND] SSH to the master node and optionally run a command
335
- --logs Display the step logs for the last executed step
336
- --scp SRC Copy a file to the master node
337
- --to DEST the destination to scp a file to
338
-
339
- Settings common to all step types
340
- --step-name STEP_NAME Set name for the step
341
- --step-action STEP_NAME Action to take when step finishes. One of CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
342
- --arg ARG Specify an argument to a bootstrap action, jar, streaming, pig-script or hive-script step
343
- --args ARGS Specify a comma separated list of arguments, e.g. --args 1,2,3 would pass three arguments
344
-
345
- Specifying Bootstrap Actions
346
- --bootstrap-action SCRIPT Run a bootstrap action script on all instances
347
- --bootstrap-name NAME Set the name of the bootstrap action
348
- Note --arg and --args are used to pass arguments to bootstrap actions
349
-
350
- Listing and Describing Job Flows
351
- --list List all job flows created in the last 2 days
352
- --describe Dump a JSON description of the supplied job flows
353
- --active List running, starting or shutting down job flows
354
- --all List all job flows in the last 2 months
355
- --nosteps Do not list steps when listing jobs
356
- --state STATE List job flows in STATE
357
- -n, --max-results MAX_RESULTS Maximum number of results to list
358
-
359
- Terminating Job Flows
360
- --terminate Terminate the job flow
361
-
362
- Common Options
363
- -j, --jobflow JOB_FLOW_ID
364
- --job-flow-id
365
- -c, --credentials CRED_FILE File containing access-id and private-key
366
- -a, --access-id ACCESS-ID AWS Access Id
367
- -k, --private-key PRIVATE-KEY AWS Private Key
368
- -v, --verbose Turn on verbose logging of program interaction
369
-
370
- Uncommon Options
371
- --debug Print stack traces when exceptions occur
372
- --endpoint ENDPOINT Specify the webservice endpoint to talk to
373
- --region REGION The region to use for the endpoint
374
- --apps-path APPS_PATH Specify s3:// path to the base of the emr public bucket to use. e.g s3://us-east-1.elasticmapreduce
375
- --beta-path BETA_PATH Specify s3:// path to the base of the emr public bucket to use for beta apps. e.g s3://beta.elasticmapreduce
376
- --version Print a version string
377
- -h, --help Show help message