wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
data/.document DELETED
@@ -1,5 +0,0 @@
1
- README.textile
2
- lib/**/*.rb
3
- bin/*
4
- LICENSE
5
- examples/*.rb
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 3.0.0
@@ -1,44 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'rubygems'
4
- require 'wukong'
5
- require 'wukong/streamer/count_keys'
6
-
7
- #
8
- # Run locally for testing:
9
- #
10
- # hdp-cat /hdfs/sometable.tsv | head -n100 | ./hdp-bin --column=4 --bin_width=0.1 --map | sort | ./hdp-bin --reduce
11
- #
12
- # Run on a giant dataset:
13
- #
14
- # hdp-bin --run --column=4 --bin_width=0.1 /hdfs/sometable.tsv /hdfs/sometable_col4_binned
15
- #
16
-
17
- Settings.define :column, :default => 1, :type => Integer, :description => "The column to bin"
18
- Settings.define :bin_width, :default => 0.5, :type => Float, :description => "What should the bin width be?"
19
-
20
- module HadoopBinning
21
-
22
- class Mapper < Wukong::Streamer::RecordStreamer
23
-
24
- def initialize *args
25
- super(*args)
26
- @bin_width = options.bin_width
27
- @column = options.column
28
- end
29
-
30
- def process *args
31
- yield bin_field(args[@column])
32
- end
33
-
34
- def bin_field field
35
- (field.to_f/@bin_width).round*@bin_width
36
- end
37
-
38
- end
39
-
40
- class Reducer < Wukong::Streamer::CountKeys; end
41
-
42
- end
43
-
44
- Wukong::Script.new(HadoopBinning::Mapper, HadoopBinning::Reducer).run
@@ -1,23 +0,0 @@
1
- #!/bin/bash
2
-
3
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
4
-
5
- input_file=${1} ; shift
6
- output_file=${1} ; shift
7
-
8
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file" ; exit ; fi
9
-
10
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
11
-
12
- cmd="${HADOOP_HOME}/bin/hadoop \
13
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
14
- -Dmapred.output.compress=true \
15
- -Dmapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
16
- -Dmapred.reduce.tasks=1 \
17
- -mapper \"/bin/cat\" \
18
- -reducer \"/bin/cat\" \
19
- -input \"$input_file\" \
20
- -output \"$output_file\" \
21
- "
22
- echo $cmd
23
- $cmd
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -cat "$@"
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
- args=`echo "$@" | ruby -ne 'a = $_.split(/\s+/); puts a.map{|arg| arg+"/[^_]*" }.join(" ")'`
3
- exec hadoop dfs -cat $args
data/bin/hdp-cp DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -cp "$@"
data/bin/hdp-du DELETED
@@ -1,86 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- OPTIONS={}
4
-
5
- #
6
- # grok options
7
- #
8
- if ARGV[0] =~ /\A-[sh]+\z/
9
- flags = ARGV.shift
10
- OPTIONS[:summary] = flags.include?('s')
11
- OPTIONS[:humanize] = flags.include?('h')
12
- end
13
-
14
- #
15
- # Prepare command
16
- #
17
- def prepare_command
18
- dfs_cmd = OPTIONS[:summary] ? 'dus' : 'du'
19
- dfs_args = ((!ARGV[0]) || ARGV[0]=='') ? '.' : "'#{ARGV.join("' '")}'"
20
- %Q{ hadoop dfs -#{dfs_cmd} #{dfs_args} }
21
- end
22
-
23
- Numeric.class_eval do
24
- def bytes() self ; end
25
- alias :byte :bytes
26
- def kilobytes() self * 1024 ; end
27
- alias :kilobyte :kilobytes
28
- def megabytes() self * 1024.kilobytes ; end
29
- alias :megabyte :megabytes
30
- def gigabytes() self * 1024.megabytes ; end
31
- alias :gigabyte :gigabytes
32
- def terabytes() self * 1024.gigabytes ; end
33
- alias :terabyte :terabytes
34
- def petabytes() self * 1024.terabytes ; end
35
- alias :petabyte :petabytes
36
- def exabytes() self * 1024.petabytes ; end
37
- alias :exabyte :exabytes
38
- end
39
-
40
- # Formats the bytes in +size+ into a more understandable representation
41
- # (e.g., giving it 1500 yields 1.5 KB). This method is useful for
42
- # reporting file sizes to users. This method returns nil if
43
- # +size+ cannot be converted into a number. You can change the default
44
- # precision of 1 using the precision parameter +precision+.
45
- #
46
- # ==== Examples
47
- # number_to_human_size(123) # => 123 Bytes
48
- # number_to_human_size(1234) # => 1.2 KB
49
- # number_to_human_size(12345) # => 12.1 KB
50
- # number_to_human_size(1234567) # => 1.2 MB
51
- # number_to_human_size(1234567890) # => 1.1 GB
52
- # number_to_human_size(1234567890123) # => 1.1 TB
53
- # number_to_human_size(1234567, 2) # => 1.18 MB
54
- # number_to_human_size(483989, 0) # => 4 MB
55
- def number_to_human_size(size, precision=1)
56
- size = Kernel.Float(size)
57
- case
58
- when size.to_i == 1; "1 Byte"
59
- when size < 1.kilobyte; "%d Bytes" % size
60
- when size < 1.megabyte; "%.#{precision}f KB" % (size / 1.0.kilobyte)
61
- when size < 1.gigabyte; "%.#{precision}f MB" % (size / 1.0.megabyte)
62
- when size < 1.terabyte; "%.#{precision}f GB" % (size / 1.0.gigabyte)
63
- else "%.#{precision}f TB" % (size / 1.0.terabyte)
64
- end #.sub(/([0-9]\.\d*?)0+ /, '\1 ' ).sub(/\. /,' ')
65
- rescue
66
- nil
67
- end
68
-
69
- OUTPUT_LINE_FMT = "%-71s\t%15d\t%15s"
70
- def format_output file, size
71
- human_size = number_to_human_size(size) || ""
72
- file = file.gsub(%r{hdfs://[^/]+/}, '/') # kill off hdfs paths, otherwise leave it alone
73
- OUTPUT_LINE_FMT % [file, size.to_i, human_size]
74
- end
75
-
76
- entries_count = 0
77
- total_size = 0
78
- %x{ #{prepare_command} }.split("\n").each do |line|
79
- if line =~ /^Found \d+ items$/ then puts line ; next end
80
- info = line.split(/\s+/)
81
- if OPTIONS[:summary] then file, size = info else size, file = info end
82
- puts format_output(file, size)
83
- total_size += size.to_i
84
- entries_count += 1
85
- end
86
- $stderr.puts OUTPUT_LINE_FMT%[" #{"%55d"%entries_count} entries", total_size, number_to_human_size(total_size)]
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -copyToLocal "$1" "$2"
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop job -kill "$@"
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop job -kill-task "$1"
data/bin/hdp-ls DELETED
@@ -1,11 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- if [ "$1" == "-r" ] || [ "$1" == "-R" ] ; then
4
- shift
5
- action=lsr
6
- else
7
- action=ls
8
- fi
9
-
10
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
11
- exec $HADOOP_HOME/bin/hadoop dfs -$action "$@"
@@ -1,2 +0,0 @@
1
- #!/usr/bin/env bash
2
- exec hadoop fs -mkdir "$@"
@@ -1,12 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- #
4
- # Despite arguments and deliberation, this IS a necessary script. Azkaban, should you choose to
5
- # use it, will fail if (it seems) ANY of its spawned subprocesses fails
6
- #
7
-
8
- hadoop fs -test -e "$@"
9
- if [ "$?" != "0" ] ; then
10
- # echo "File does not exist, making..."
11
- exec hadoop fs -mkdir "$@"
12
- fi
data/bin/hdp-mv DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -mv "$@"
@@ -1,77 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- dir_to_rename = ARGV[0]
4
- dest_ext = '.tsv'
5
-
6
- unless dir_to_rename && (! dir_to_rename.empty?)
7
- warn "Need a directory or file spec to rename."
8
- exit
9
- end
10
-
11
- #
12
- # Setup
13
- #
14
- warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"
15
-
16
- #
17
- # Examine the files
18
- #
19
- file_listings = `hdp-ls #{dir_to_rename}`.split("\n")
20
- command_lists = { }
21
- file_listings[1..-1].each do |file_listing|
22
- m = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}.match(file_listing)
23
- if !m then warn "Couldn't grok #{file_listing}" ; next ; end
24
- size, filename = m.captures
25
- case
26
- when size.to_i == 0 then (command_lists[:deletes]||=[]) << filename
27
- else
28
- firstline = `hdp-cat #{filename} | head -qn1 `
29
- file_key, _ = firstline.split("\t", 2)
30
- unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
31
- warn "Don't want to rename to '#{file_key}'... skipping"
32
- next
33
- end
34
- dirname = File.dirname(filename)
35
- destfile = File.join(dirname, file_key)+dest_ext
36
- (command_lists[:moves]||=[]) << "hdp-mv #{filename} #{destfile}"
37
- end
38
- end
39
-
40
- #
41
- # Execute the command_lists
42
- #
43
- command_lists.each do |type, command_list|
44
- case type
45
- when :deletes
46
- command = "hdp-rm #{command_list.join(" ")}"
47
- puts command
48
- `#{command}`
49
- when :moves
50
- command_list.each do |command|
51
- puts command
52
- `#{command}`
53
- end
54
- end
55
- end
56
-
57
-
58
- # -rw-r--r-- 3 flip supergroup 0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010
59
-
60
- # # Killing empty files
61
- # find . -size 0 -print -exec rm {} \;
62
- #
63
- # for foo in part-0* ; do
64
- # newname=`
65
- # head -n1 $foo |
66
- # cut -d' ' -f1 |
67
- # ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
68
- # `.tsv ;
69
- # echo "moving $foo to $newname"
70
- # mv "$foo" "$newname"
71
- # done
72
- #
73
- # # dir=`basename $PWD`
74
- # # for foo in *.tsv ; do
75
- # # echo "Compressing $dir"
76
- # # bzip2 -c $foo > ../$dir-bz2/$foo.bz2
77
- # # done
data/bin/hdp-ps DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop job -list all
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -put "$@"
data/bin/hdp-rm DELETED
@@ -1,32 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- #
4
- # Documentation for hadoop fs -rmr says "acts just like Unix rm -rf command". If this
5
- # is true then we need to ignore directories that don't exist and still return 0.
6
- #
7
-
8
- #
9
- # All the dirty conditional logic here does is test whether a directory exists. If so, remove it
10
- #
11
- if [ "$1" == "-r" ] ; then
12
- shift
13
- if [ "$1" == "-skipTrash" ] ; then
14
- shift
15
- hadoop fs -test -e "$@"
16
- if [ "$?" == "0" ] ; then
17
- # echo "File exists, skipping trash, removing it..."
18
- echo hadoop dfs -rmr -skipTrash "$@"
19
- exec hadoop dfs -rmr -skipTrash "$@"
20
- fi
21
- else
22
- hadoop fs -test -e "$@"
23
- if [ "$?" == "0" ] ; then
24
- # echo "File exists, removing it..."
25
- echo hadoop dfs -rmr "$@"
26
- exec hadoop dfs -rmr "$@"
27
- fi
28
- fi
29
- else
30
- echo hadoop dfs -rm "$@"
31
- exec hadoop dfs -rm "$@"
32
- fi
@@ -1,40 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- input_file=${1} ; shift
4
- output_file=${1} ; shift
5
- map_script=${1-/bin/cat} ; shift
6
- reduce_script=${1-/usr/bin/uniq} ; shift
7
- partfields=${1-2} ; shift
8
- sortfields=${1-2} ; shift
9
-
10
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
11
-
12
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
13
-
14
- cmd="${HADOOP_HOME}/bin/hadoop \
15
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
16
- $@
17
- -D num.key.fields.for.partition=\"$partfields\"
18
- -D stream.num.map.output.key.fields=\"$sortfields\"
19
- -D stream.map.output.field.separator=\"'/t'\"
20
- -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
21
- -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
22
- -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
23
- -mapper \"$map_script\"
24
- -reducer \"$reduce_script\"
25
- -input \"$input_file\"
26
- -output \"$output_file\"
27
- "
28
-
29
- echo "$cmd"
30
-
31
- $cmd
32
-
33
- # For a map-side-only job specify
34
- # -jobconf mapred.reduce.tasks=0 \
35
-
36
- # Maybe?
37
- #
38
- # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
39
- # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
40
- #
@@ -1,40 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- input_file=${1} ; shift
4
- output_file=${1} ; shift
5
- map_script=${1-/bin/cat} ; shift
6
- reduce_script=${1-/usr/bin/uniq} ; shift
7
- partfields=${1-2} ; shift
8
- sortfields=${1-2} ; shift
9
-
10
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
11
-
12
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
13
-
14
- cmd="${HADOOP_HOME}/bin/hadoop \
15
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
16
- $@
17
- -D num.key.fields.for.partition=\"$partfields\"
18
- -D stream.num.map.output.key.fields=\"$sortfields\"
19
- -D stream.map.output.field.separator=\"'/t'\"
20
- -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
21
- -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
22
- -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
23
- -mapper \"$map_script\"
24
- -reducer \"$reduce_script\"
25
- -input \"$input_file\"
26
- -output \"$output_file\"
27
- "
28
-
29
- echo "$cmd"
30
-
31
- $cmd
32
-
33
- # For a map-side-only job specify
34
- # -jobconf mapred.reduce.tasks=0 \
35
-
36
- # Maybe?
37
- #
38
- # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
39
- # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
40
- #
@@ -1,22 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- input_file="${1}" ; shift
4
- output_file="${1}" ; shift
5
- map_script="${1-/bin/cat}" ; shift
6
- reduce_script="${1-/usr/bin/uniq}" ; shift
7
-
8
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [extra_args]" ; exit ; fi
9
-
10
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
11
-
12
- # Can add fun stuff like
13
- # -Dmapred.reduce.tasks=0 \
14
-
15
- exec ${HADOOP_HOME}/bin/hadoop \
16
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
17
- "$@" \
18
- -Dmapred.job.name=`basename $0`-$map_script-$input_file-$output_file \
19
- -mapper "$map_script" \
20
- -reducer "$reduce_script" \
21
- -input "$input_file" \
22
- -output "$output_file"