wukong 3.0.0.pre → 3.0.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
data/.document DELETED
@@ -1,5 +0,0 @@
1
- README.textile
2
- lib/**/*.rb
3
- bin/*
4
- LICENSE
5
- examples/*.rb
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 3.0.0
@@ -1,44 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'rubygems'
4
- require 'wukong'
5
- require 'wukong/streamer/count_keys'
6
-
7
- #
8
- # Run locally for testing:
9
- #
10
- # hdp-cat /hdfs/sometable.tsv | head -n100 | ./hdp-bin --column=4 --bin_width=0.1 --map | sort | ./hdp-bin --reduce
11
- #
12
- # Run on a giant dataset:
13
- #
14
- # hdp-bin --run --column=4 --bin_width=0.1 /hdfs/sometable.tsv /hdfs/sometable_col4_binned
15
- #
16
-
17
- Settings.define :column, :default => 1, :type => Integer, :description => "The column to bin"
18
- Settings.define :bin_width, :default => 0.5, :type => Float, :description => "What should the bin width be?"
19
-
20
- module HadoopBinning
21
-
22
- class Mapper < Wukong::Streamer::RecordStreamer
23
-
24
- def initialize *args
25
- super(*args)
26
- @bin_width = options.bin_width
27
- @column = options.column
28
- end
29
-
30
- def process *args
31
- yield bin_field(args[@column])
32
- end
33
-
34
- def bin_field field
35
- (field.to_f/@bin_width).round*@bin_width
36
- end
37
-
38
- end
39
-
40
- class Reducer < Wukong::Streamer::CountKeys; end
41
-
42
- end
43
-
44
- Wukong::Script.new(HadoopBinning::Mapper, HadoopBinning::Reducer).run
@@ -1,23 +0,0 @@
1
- #!/bin/bash
2
-
3
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
4
-
5
- input_file=${1} ; shift
6
- output_file=${1} ; shift
7
-
8
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file" ; exit ; fi
9
-
10
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
11
-
12
- cmd="${HADOOP_HOME}/bin/hadoop \
13
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
14
- -Dmapred.output.compress=true \
15
- -Dmapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
16
- -Dmapred.reduce.tasks=1 \
17
- -mapper \"/bin/cat\" \
18
- -reducer \"/bin/cat\" \
19
- -input \"$input_file\" \
20
- -output \"$output_file\" \
21
- "
22
- echo $cmd
23
- $cmd
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -cat "$@"
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
- args=`echo "$@" | ruby -ne 'a = $_.split(/\s+/); puts a.map{|arg| arg+"/[^_]*" }.join(" ")'`
3
- exec hadoop dfs -cat $args
data/bin/hdp-cp DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -cp "$@"
data/bin/hdp-du DELETED
@@ -1,86 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- OPTIONS={}
4
-
5
- #
6
- # grok options
7
- #
8
- if ARGV[0] =~ /\A-[sh]+\z/
9
- flags = ARGV.shift
10
- OPTIONS[:summary] = flags.include?('s')
11
- OPTIONS[:humanize] = flags.include?('h')
12
- end
13
-
14
- #
15
- # Prepare command
16
- #
17
- def prepare_command
18
- dfs_cmd = OPTIONS[:summary] ? 'dus' : 'du'
19
- dfs_args = ((!ARGV[0]) || ARGV[0]=='') ? '.' : "'#{ARGV.join("' '")}'"
20
- %Q{ hadoop dfs -#{dfs_cmd} #{dfs_args} }
21
- end
22
-
23
- Numeric.class_eval do
24
- def bytes() self ; end
25
- alias :byte :bytes
26
- def kilobytes() self * 1024 ; end
27
- alias :kilobyte :kilobytes
28
- def megabytes() self * 1024.kilobytes ; end
29
- alias :megabyte :megabytes
30
- def gigabytes() self * 1024.megabytes ; end
31
- alias :gigabyte :gigabytes
32
- def terabytes() self * 1024.gigabytes ; end
33
- alias :terabyte :terabytes
34
- def petabytes() self * 1024.terabytes ; end
35
- alias :petabyte :petabytes
36
- def exabytes() self * 1024.petabytes ; end
37
- alias :exabyte :exabytes
38
- end
39
-
40
- # Formats the bytes in +size+ into a more understandable representation
41
- # (e.g., giving it 1500 yields 1.5 KB). This method is useful for
42
- # reporting file sizes to users. This method returns nil if
43
- # +size+ cannot be converted into a number. You can change the default
44
- # precision of 1 using the precision parameter +precision+.
45
- #
46
- # ==== Examples
47
- # number_to_human_size(123) # => 123 Bytes
48
- # number_to_human_size(1234) # => 1.2 KB
49
- # number_to_human_size(12345) # => 12.1 KB
50
- # number_to_human_size(1234567) # => 1.2 MB
51
- # number_to_human_size(1234567890) # => 1.1 GB
52
- # number_to_human_size(1234567890123) # => 1.1 TB
53
- # number_to_human_size(1234567, 2) # => 1.18 MB
54
- # number_to_human_size(483989, 0) # => 4 MB
55
- def number_to_human_size(size, precision=1)
56
- size = Kernel.Float(size)
57
- case
58
- when size.to_i == 1; "1 Byte"
59
- when size < 1.kilobyte; "%d Bytes" % size
60
- when size < 1.megabyte; "%.#{precision}f KB" % (size / 1.0.kilobyte)
61
- when size < 1.gigabyte; "%.#{precision}f MB" % (size / 1.0.megabyte)
62
- when size < 1.terabyte; "%.#{precision}f GB" % (size / 1.0.gigabyte)
63
- else "%.#{precision}f TB" % (size / 1.0.terabyte)
64
- end #.sub(/([0-9]\.\d*?)0+ /, '\1 ' ).sub(/\. /,' ')
65
- rescue
66
- nil
67
- end
68
-
69
- OUTPUT_LINE_FMT = "%-71s\t%15d\t%15s"
70
- def format_output file, size
71
- human_size = number_to_human_size(size) || ""
72
- file = file.gsub(%r{hdfs://[^/]+/}, '/') # kill off hdfs paths, otherwise leave it alone
73
- OUTPUT_LINE_FMT % [file, size.to_i, human_size]
74
- end
75
-
76
- entries_count = 0
77
- total_size = 0
78
- %x{ #{prepare_command} }.split("\n").each do |line|
79
- if line =~ /^Found \d+ items$/ then puts line ; next end
80
- info = line.split(/\s+/)
81
- if OPTIONS[:summary] then file, size = info else size, file = info end
82
- puts format_output(file, size)
83
- total_size += size.to_i
84
- entries_count += 1
85
- end
86
- $stderr.puts OUTPUT_LINE_FMT%[" #{"%55d"%entries_count} entries", total_size, number_to_human_size(total_size)]
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -copyToLocal "$1" "$2"
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop job -kill "$@"
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop job -kill-task "$1"
data/bin/hdp-ls DELETED
@@ -1,11 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- if [ "$1" == "-r" ] || [ "$1" == "-R" ] ; then
4
- shift
5
- action=lsr
6
- else
7
- action=ls
8
- fi
9
-
10
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
11
- exec $HADOOP_HOME/bin/hadoop dfs -$action "$@"
@@ -1,2 +0,0 @@
1
- #!/usr/bin/env bash
2
- exec hadoop fs -mkdir "$@"
@@ -1,12 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- #
4
- # Despite arguments and deliberation, this IS a necessary script. Azkaban, should you choose to
5
- # use it, will fail if (it seems) ANY of its spawned subprocesses fails
6
- #
7
-
8
- hadoop fs -test -e "$@"
9
- if [ "$?" != "0" ] ; then
10
- # echo "File does not exist, making..."
11
- exec hadoop fs -mkdir "$@"
12
- fi
data/bin/hdp-mv DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -mv "$@"
@@ -1,77 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- dir_to_rename = ARGV[0]
4
- dest_ext = '.tsv'
5
-
6
- unless dir_to_rename && (! dir_to_rename.empty?)
7
- warn "Need a directory or file spec to rename."
8
- exit
9
- end
10
-
11
- #
12
- # Setup
13
- #
14
- warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"
15
-
16
- #
17
- # Examine the files
18
- #
19
- file_listings = `hdp-ls #{dir_to_rename}`.split("\n")
20
- command_lists = { }
21
- file_listings[1..-1].each do |file_listing|
22
- m = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}.match(file_listing)
23
- if !m then warn "Couldn't grok #{file_listing}" ; next ; end
24
- size, filename = m.captures
25
- case
26
- when size.to_i == 0 then (command_lists[:deletes]||=[]) << filename
27
- else
28
- firstline = `hdp-cat #{filename} | head -qn1 `
29
- file_key, _ = firstline.split("\t", 2)
30
- unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
31
- warn "Don't want to rename to '#{file_key}'... skipping"
32
- next
33
- end
34
- dirname = File.dirname(filename)
35
- destfile = File.join(dirname, file_key)+dest_ext
36
- (command_lists[:moves]||=[]) << "hdp-mv #{filename} #{destfile}"
37
- end
38
- end
39
-
40
- #
41
- # Execute the command_lists
42
- #
43
- command_lists.each do |type, command_list|
44
- case type
45
- when :deletes
46
- command = "hdp-rm #{command_list.join(" ")}"
47
- puts command
48
- `#{command}`
49
- when :moves
50
- command_list.each do |command|
51
- puts command
52
- `#{command}`
53
- end
54
- end
55
- end
56
-
57
-
58
- # -rw-r--r-- 3 flip supergroup 0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010
59
-
60
- # # Killing empty files
61
- # find . -size 0 -print -exec rm {} \;
62
- #
63
- # for foo in part-0* ; do
64
- # newname=`
65
- # head -n1 $foo |
66
- # cut -d' ' -f1 |
67
- # ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
68
- # `.tsv ;
69
- # echo "moving $foo to $newname"
70
- # mv "$foo" "$newname"
71
- # done
72
- #
73
- # # dir=`basename $PWD`
74
- # # for foo in *.tsv ; do
75
- # # echo "Compressing $dir"
76
- # # bzip2 -c $foo > ../$dir-bz2/$foo.bz2
77
- # # done
data/bin/hdp-ps DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop job -list all
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- exec hadoop dfs -put "$@"
data/bin/hdp-rm DELETED
@@ -1,32 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- #
4
- # Documentation for hadoop fs -rmr says "acts just like Unix rm -rf command". If this
5
- # is true then we need to ignore directories that don't exist and still return 0.
6
- #
7
-
8
- #
9
- # All the dirty conditional logic here does is test whether a directory exists. If so, remove it
10
- #
11
- if [ "$1" == "-r" ] ; then
12
- shift
13
- if [ "$1" == "-skipTrash" ] ; then
14
- shift
15
- hadoop fs -test -e "$@"
16
- if [ "$?" == "0" ] ; then
17
- # echo "File exists, skipping trash, removing it..."
18
- echo hadoop dfs -rmr -skipTrash "$@"
19
- exec hadoop dfs -rmr -skipTrash "$@"
20
- fi
21
- else
22
- hadoop fs -test -e "$@"
23
- if [ "$?" == "0" ] ; then
24
- # echo "File exists, removing it..."
25
- echo hadoop dfs -rmr "$@"
26
- exec hadoop dfs -rmr "$@"
27
- fi
28
- fi
29
- else
30
- echo hadoop dfs -rm "$@"
31
- exec hadoop dfs -rm "$@"
32
- fi
@@ -1,40 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- input_file=${1} ; shift
4
- output_file=${1} ; shift
5
- map_script=${1-/bin/cat} ; shift
6
- reduce_script=${1-/usr/bin/uniq} ; shift
7
- partfields=${1-2} ; shift
8
- sortfields=${1-2} ; shift
9
-
10
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
11
-
12
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
13
-
14
- cmd="${HADOOP_HOME}/bin/hadoop \
15
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
16
- $@
17
- -D num.key.fields.for.partition=\"$partfields\"
18
- -D stream.num.map.output.key.fields=\"$sortfields\"
19
- -D stream.map.output.field.separator=\"'/t'\"
20
- -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
21
- -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
22
- -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
23
- -mapper \"$map_script\"
24
- -reducer \"$reduce_script\"
25
- -input \"$input_file\"
26
- -output \"$output_file\"
27
- "
28
-
29
- echo "$cmd"
30
-
31
- $cmd
32
-
33
- # For a map-side-only job specify
34
- # -jobconf mapred.reduce.tasks=0 \
35
-
36
- # Maybe?
37
- #
38
- # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
39
- # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
40
- #
@@ -1,40 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- input_file=${1} ; shift
4
- output_file=${1} ; shift
5
- map_script=${1-/bin/cat} ; shift
6
- reduce_script=${1-/usr/bin/uniq} ; shift
7
- partfields=${1-2} ; shift
8
- sortfields=${1-2} ; shift
9
-
10
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
11
-
12
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
13
-
14
- cmd="${HADOOP_HOME}/bin/hadoop \
15
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
16
- $@
17
- -D num.key.fields.for.partition=\"$partfields\"
18
- -D stream.num.map.output.key.fields=\"$sortfields\"
19
- -D stream.map.output.field.separator=\"'/t'\"
20
- -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
21
- -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
22
- -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
23
- -mapper \"$map_script\"
24
- -reducer \"$reduce_script\"
25
- -input \"$input_file\"
26
- -output \"$output_file\"
27
- "
28
-
29
- echo "$cmd"
30
-
31
- $cmd
32
-
33
- # For a map-side-only job specify
34
- # -jobconf mapred.reduce.tasks=0 \
35
-
36
- # Maybe?
37
- #
38
- # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
39
- # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
40
- #
@@ -1,22 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- input_file="${1}" ; shift
4
- output_file="${1}" ; shift
5
- map_script="${1-/bin/cat}" ; shift
6
- reduce_script="${1-/usr/bin/uniq}" ; shift
7
-
8
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [extra_args]" ; exit ; fi
9
-
10
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
11
-
12
- # Can add fun stuff like
13
- # -Dmapred.reduce.tasks=0 \
14
-
15
- exec ${HADOOP_HOME}/bin/hadoop \
16
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
17
- "$@" \
18
- -Dmapred.job.name=`basename $0`-$map_script-$input_file-$output_file \
19
- -mapper "$map_script" \
20
- -reducer "$reduce_script" \
21
- -input "$input_file" \
22
- -output "$output_file"