wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,263 +0,0 @@
1
- ---
2
- layout: default
3
- title: mrflip.github.com/wukong - wu-utils utilities
4
- collapse: false
5
- ---
6
-
7
- h1(gemheader). Wukong Utility Scripts
8
-
9
- ** "Overview of wutils":#wutils -- command listing
10
- ** "Stupid command-line tricks":#cmdlinetricks using the wutils
11
- ** "wu-lign":#wulign -- present a tab-separated file as aligned columns
12
- ** Dear Lazyweb, please build this for us: "tab-oriented version of the Textutils library":#wutilsinc
13
-
14
- <notextile><div class="toggle"></notextile>
15
-
16
- h2(#cmdlinetricks). Stupid command-line tricks
17
-
18
- Here are a few useful little snippets you can run from the command line:
19
-
20
- h3. Histogram
21
-
22
- Given data with a date column:
23
-
24
- <pre>
25
- message 235623 20090423012345 Now is the winter of our discontent Made glorious summer by this son of York
26
- message 235623 20080101230900 These pretzels are making me THIRSTY!
27
- ...
28
- </pre>
29
-
30
- You can calculate number of messages sent by day with
31
-
32
- <pre>
33
- cat messages | cuttab 3 | cutc 8 | sort | uniq -c
34
- </pre>
35
-
36
- (see the wuhist command, below.)
37
-
38
- h3. Simple intersection, union, etc
39
-
40
- For two datasets (batch_1 and batch_2) with unique entries (no repeated lines),
41
-
42
- * Their union is simple:
43
-
44
- <pre>
45
- cat batch_1 batch_2 | sort -u
46
- </pre>
47
-
48
- * To find their intersection, concatenate the two sets and filter out everything that only occurred once.
49
-
50
- <pre>
51
- cat batch_1 batch_2 | sort | uniq -c | egrep -v '^ *1 '
52
- </pre>
53
-
54
- * For the complement of the intersection, use @... | egrep '^ *1 '@
55
-
56
- * In both cases, if the files are each internally sorted, the commandline sort takes a --merge flag:
57
-
58
- <pre>
59
- sort --merge -u batch_1 batch_2
60
- </pre>
61
-
62
- <notextile></div><div class="toggle"></notextile>
63
-
64
- h2(#wutils). Wutils Command Listing
65
-
66
- h3. cutc
67
-
68
- @cutc [colnum]@
69
-
70
- Ex.
71
-
72
- @echo -e 'foo\tbar\tbaz' | cutc 6@
73
- @foo ba@
74
-
75
- Cuts from beginning of line to given column (default 200). A tab is one character, so right margin can still be ragged.
76
-
77
- h3. cuttab
78
-
79
- @cuttab [colspec]@
80
-
81
- Cuts given tab-separated columns. You can give a comma separated list of numbers
82
- or ranges 1-4. Columns are numbered from 1.
83
-
84
- Ex.
85
-
86
- <pre>
87
- echo -e 'foo\tbar\tbaz' | cuttab 1,3
88
- foo baz
89
- </pre>
90
-
91
- h3. hdp-*
92
-
93
- These perform the corresponding commands on the HDFS filesystem. In general,
94
- where they accept command-line flags, they go with the GNU-style ones, not the
95
- hadoop-style: so, @hdp-du -s dir@ or @hdp-rm -r foo/@
96
-
97
- * @hdp-cat@
98
- * @hdp-catd@ -- cats the files that don't start with '_' in a directory. Use this for a pile of @.../part-00000@ files
99
- * @hdp-du@
100
- * @hdp-get@
101
- * @hdp-kill@
102
- * @hdp-ls@
103
- * @hdp-mkdir@
104
- * @hdp-mv@
105
- * @hdp-ps@
106
- * @hdp-put@
107
- * @hdp-rm@
108
- * @hdp-sync@
109
-
110
- h3. hdp-sort, hdp-stream, hdp-stream-flat
111
-
112
- * @hdp-sort@
113
- * @hdp-stream@
114
- * @hdp-stream-flat@
115
-
116
- <pre><code>
117
- hdp-stream input_filespec output_file map_cmd reduce_cmd num_key_fields
118
- </code></pre>
119
-
120
- h3. tabchar
121
-
122
- Outputs a single tab character.
123
-
124
- h3. wuhist
125
-
126
- Occasionally useful to gather a lexical histogram of a single column:
127
-
128
- Ex.
129
-
130
- <pre><code>
131
- $ echo -e 'foo\nbar\nbar\nfoo\nfoo\nfoo\n7' | ./wuhist
132
- 4 foo
133
- 2 bar
134
- 1 7
135
- </code></pre>
136
-
137
- (the output will have a tab between the first and second column, for further processing.)
138
-
139
- h3. wulign
140
-
141
- Intelligently format a tab-separated file into aligned columns (while remaining tab-separated for further processing). See "below":#wulign.
142
-
143
- h3. hdp-parts_to_keys.rb
144
-
145
- A *very* clumsy script to rename reduced hadoop output files by their initial key.
146
-
147
- If your output file has an initial key in the first column and you pass it through hdp-sort, they will be distributed across reducers and thus output files. (Because of the way hadoop hashes the keys, there's no guarantee that each file will get a distinct key. You could have 2 keys with a million entries and they could land sequentially on the same reducer, always fun.)
148
-
149
- If you're willing to roll the dice, this script will rename files according to the first key in the first line.
150
-
151
- **Do you have or know of a native hadoop utility to do this?** If so, please get in touch!
152
-
153
- <notextile></div><div class="toggle"></notextile>
154
-
155
- h2(#wulign). wu-lign -- format a tab-separated file as aligned columns
156
-
157
- wu-lign will intelligently reformat a tab-separated file into a tab-separated, space aligned file that is still suitable for further processing. For example, given the log-file input
158
-
159
- <pre><code>
160
- 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
161
- 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
162
- 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
163
- 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
164
- 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
165
- 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
166
- 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
167
- </code></pre>
168
-
169
- wu-lign will reformat it to read
170
-
171
- <pre><code>
172
- 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
173
- 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
174
- 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
175
- 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
176
- 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
177
- 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
178
- 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
179
- </code></pre>
180
-
181
- The fields are still tab-delimited by exactly one tab -- only spaces are used to pad out fields. You can still use cuttab and friends to manipulate columns.
182
-
183
- wu-lign isn't intended to be smart, or correct, or reliable -- only to be useful for previewing and organizing tab-formatted files. In general @wu-lign(foo).split("\t").map(&:strip)@ *should* give output semantically equivalent to its input. (That is, the only changes should be insertion of spaces and re-formatting of numerics.) But still -- reserve its use for human inspection only.
184
-
185
- (Note: tab characters in this source code file have been converted to spaces; replace whitespace with tab in the first example if you'd like to play along at home.)
186
-
187
- h3. How it works
188
-
189
- wu-lign takes the first 1000 lines, splits each on TAB characters into fields, and tries to guess the format -- int, float, or string -- for each field. It builds a consensus of the width and type for corresponding columns in the chunk. If a column has mixed numeric and string formats, it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements, all of them are formatted as :float.
190
-
191
- h3. Command-line arguments
192
-
193
- You can give sprintf-style positional arguments on the command line that will be applied to the corresponding columns. (Blank args act as placeholders; auto-formatting is still applied to those columns.) So with the example above,
194
-
195
- @cat foo | wu-lign '' '' '' '%8.4e'@
196
-
197
- will format the fourth column with "%8.4e", while the first three columns and fifth-and-higher columns are formatted as usual.
198
-
199
- <pre><code>
200
- ...
201
- 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
202
- 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
203
- 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
204
- ...
205
- </code></pre>
206
-
207
- h3. Notes
208
-
209
- * It has no knowledge of header rows. An all-text first line will screw everything up.
210
- * It also requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
211
- * It won't set columns wider than 70 chars -- this allows for the occasional super-wide column without completely breaking your screen.
212
- * For :float values, wu-lign tries to guess at the right number of significant digits to the left and right of the decimal point.
213
- * wu-lign does not parse 'TSV files' in their strict sense -- there is no quoting or escaping; every tab delimits a field, every newline a record.
214
-
215
- h2(#wutilsinc). Dear Lazyweb, please build this
216
-
217
- * uniq - report or filter out repeated lines in a file
218
- ** -c produces line<tab>count
219
- ** --ignore f1,f2,... discards given fields from consideration. field syntax same as for cut, etc.
220
-
221
- * sort - sort lines of text files
222
- ** columns indexed as tab-separated
223
- ** can specify any column order, uses same field spec as cut
224
- * tsort - topological sort of a directed graph
225
-
226
- * cut - select portions of each line of a file
227
- ** can reorder columns
228
- * nl - line numbering filter
229
- ** takes prefix, suffix
230
- ** count \t line -OR- line \t count
231
-
232
- * wc - word, line, character, and byte count
233
- ** field count (tab-separated fields)
234
- * paste - merge corresponding or subsequent lines of files
235
- * expand, unexpand - expand tabs to spaces, and vice versa
236
- * seq
237
- * simple row, column sums
238
- * join - relational database operator
239
- * tac
240
-
241
- * cat - concatenate and print files
242
- * head - display first lines of a file
243
- * tail - display the last part of a file
244
- * shuf
245
- * split - split a file into pieces
246
- * csplit - split files based on context
247
- * tee - pipe fitting
248
-
249
- * ls - list directory contents.
250
- * df - display free disk space
251
- * du - display disk usage statistics
252
- ** tab-delimited, space aligned
253
-
254
- * od - octal, decimal, hex, ASCII dump
255
- * printf - formatted output
256
- * cksum, sum - display file checksums and block counts
257
- * md5sum
258
-
259
- * diff
260
- * comm
261
-
262
-
263
- <notextile></div></notextile>
@@ -1,11 +0,0 @@
1
-
2
- # TODO: a flow with splits and stuff
3
-
4
- # parsed = map{|line| ApacheLogLine.make(line) }
5
- #
6
- # input(:default) > parsed
7
- #
8
- # parsed > split.into(
9
- # to_json > output(:dump, stdout),
10
- # to_tsv > output(:tsv, file_sink(Pathname.path_to(:tmp, 'foo.tsv')))
11
- # )
@@ -1,13 +0,0 @@
1
- Wukong.dataflow(:gotta_make_the_donuts) do
2
- input :dough_circles, dough_hopper
3
- output :donut_box, box(:capacity => 12)
4
-
5
- input(:dough_circles) >
6
- frier(:top_frier) >
7
- flipper >
8
- frier(:btm_frier) >
9
- cooling(:pre_glazer) >
10
- glazer >
11
- cooling(:ready) >
12
- output(:donut_box)
13
- end
@@ -1,92 +0,0 @@
1
- 1 1872
2
- 1 alice
3
- 2 all
4
- 15 and
5
- 1 arms
6
- 1 awhile
7
- 1 back
8
- 1 bandersnatch
9
- 1 beamish
10
- 2 beware
11
- 1 bird
12
- 1 bite
13
- 1 blade
14
- 2 borogoves
15
- 1 boy
16
- 2 brillig
17
- 1 burbled
18
- 1 callay
19
- 1 callooh
20
- 2 came
21
- 1 carroll
22
- 1 catch
23
- 1 chortled
24
- 1 claws
25
- 1 come
26
- 1 day
27
- 1 dead
28
- 2 did
29
- 1 eyes
30
- 1 flame
31
- 1 foe
32
- 1 found
33
- 1 frabjous
34
- 1 from
35
- 1 frumious
36
- 1 galumphing
37
- 2 gimble
38
- 1 glass
39
- 2 gyre
40
- 1 hand
41
- 1 has
42
- 1 head
43
- 2 his
44
- 1 its
45
- 3 jabberwock
46
- 1 jabberwocky
47
- 1 jaws
48
- 1 joy
49
- 1 jubjub
50
- 1 left
51
- 1 lewis
52
- 1 long
53
- 1 looking
54
- 1 manxome
55
- 2 mimsy
56
- 2 mome
57
- 2 one
58
- 2 outgrabe
59
- 2 raths
60
- 1 rested
61
- 1 shun
62
- 1 slain
63
- 2 slithy
64
- 1 snack
65
- 1 snicker
66
- 1 son
67
- 1 sought
68
- 2 stood
69
- 1 sword
70
- 2 that
71
- 20 the
72
- 1 there
73
- 1 thou
74
- 2 thought
75
- 4 through
76
- 1 time
77
- 1 took
78
- 2 toves
79
- 1 tree
80
- 1 tulgey
81
- 1 tumtum
82
- 2 twas
83
- 2 two
84
- 1 uffish
85
- 2 vorpal
86
- 2 wabe
87
- 2 went
88
- 2 were
89
- 1 what
90
- 1 whiffling
91
- 2 with
92
- 1 wood
@@ -1,48 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'wukong'
4
-
5
- # cat data/jabberwocky.txt | bin/wu-map examples/word_count.rb | sort | bin/wu-red examples/word_count.rb | sort -rnk2 | head
6
-
7
- Wukong.processor(:add_count) do
8
- def process(word)
9
- emit [word, 1]
10
- end
11
- end
12
-
13
- Wukong.processor(:accumulator) do
14
- attr_accessor :current, :count
15
-
16
- def setup() reset! ; end
17
-
18
- def stop() report_then_reset! ; end
19
-
20
- def reset!() @current = nil ; @count = 0 ; end
21
-
22
- def report_then_reset!
23
- emit [current, count] unless current.nil?
24
- reset!
25
- end
26
-
27
- def accumulate(word, seen)
28
- @current = word if @current.nil?
29
- @count += seen
30
- end
31
-
32
- def process(pair)
33
- word, seen = pair
34
- report_then_reset! unless word == current
35
- accumulate(word, seen.to_i)
36
- end
37
-
38
- end
39
-
40
- Wukong.dataflow(:mapper) do
41
- splitter = map { |line| line.downcase.strip.split(/\W/) }
42
- cleaner = reject { |word| word.length < 2 }
43
- splitter > flatten > cleaner > add_count > to_tsv
44
- end
45
-
46
- Wukong.dataflow(:reducer) do
47
- from_tsv > accumulator > to_tsv
48
- end
@@ -1,24 +0,0 @@
1
-
2
-
3
-
4
- connect('split:top').into('flatten:ingredient')
5
-
6
- combine << utensil('bowl') << ingredient('flour') << ingredient('salt') << ingredient('sugar') > ingredient('dough')
7
-
8
-
9
- task 'package' do
10
- slot(:docs) << directory('docs')
11
- slot(:exe) << action(:compiled)
12
- end
13
-
14
-
15
-
16
- wukong 'foo.rb', 'x.tsv', 'y.tsv', :reduce_tasks => 0, :min_split_size => '1M' > 'foo_out.tsv'
17
-
18
-
19
-
20
- wukong 'combine.rb', 'x.tsv', 'y.tsv', :reduce_tasks => 0, :min_split_size => '1M' > :raw_pie
21
-
22
- pig
23
-
24
- wukong 'bake.rb', :raw_pie > :pie