wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476)
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,220 +0,0 @@
1
- module Hanuman
2
-
3
- #
4
- # For stages that can be linked to directly
5
- # Including this means your stage has exactly one input (itself).
6
- #
7
- module IsOwnInputSlot
8
- extend Gorillib::Concern
9
- include Inlinkable
10
- included do
11
- magic :input, Hanuman::Stage, :writer => false, :tester => true, :doc => 'stage/slot in graph that feeds into this one'
12
- end
13
- def inputs
14
- input? ? [input] : []
15
- end
16
- end
17
-
18
- #
19
- # For stages that can be linked to directly
20
- # Including this means your stage has exactly one output (itself).
21
- #
22
- module IsOwnOutputSlot
23
- extend Gorillib::Concern
24
- include Outlinkable
25
- included do
26
- magic :output, Hanuman::Stage, :writer => false, :tester => true, :doc => 'stage/slot in graph this one feeds into'
27
- end
28
- def outputs
29
- output? ? [output] : []
30
- end
31
- end
32
-
33
- #
34
- # For stages with named slots
35
- #
36
- # A named slot is a special kind of field: saying
37
- #
38
- # consumes :brain
39
- #
40
- # gives your class
41
- #
42
- # * A normal attribute `brain_slot`
43
- # * methods `brain_slot`, `receive_brain_slot` to go with it
44
- # * method `brain`, returning the item (if any) connected to the brain slot
45
- # * method `brain=` (alias for `receive_brain`) that links the brain slot with the given item
46
- #
47
- # @note that at the moment you can't have an input and an output with the same name.
48
- #
49
- module Slottable
50
- extend Gorillib::Concern
51
- include Inlinkable
52
- include Outlinkable
53
-
54
- included do
55
- collection :outslots, Hanuman::OutputSlot, :key_method => :name
56
- end
57
-
58
- def inputs
59
- inslots.to_a.map{|slot| slot.input }.compact
60
- end
61
-
62
- def inslots
63
- self.class.inslot_fields.map{|_, slot_field| read_attribute(slot_field.name) }
64
- end
65
-
66
- def handle_extra_attributes(attrs)
67
- self.class.inslot_fields.each do |_, field|
68
- field_name = field.basename
69
- next unless attrs.has_key?(field_name)
70
- self.public_send(:"receive_#{field_name}", attrs.delete(field_name))
71
- end
72
- super(attrs)
73
- end
74
-
75
- module ClassMethods
76
- def consumes(name, options={})
77
- field name, Hanuman::Stage, {:field_type => InputSlotField}.merge(options)
78
- end
79
- def produces(name, options={})
80
- field name, Hanuman::Stage, {:field_type => OutputSlotField}.merge(options)
81
- end
82
-
83
- def define_slot_reader(field)
84
- meth_name = field.basename
85
- slot_name = field.name
86
- type = field.type
87
- define_meta_module_method(meth_name, true) do ||
88
- begin
89
- slot = read_attribute(slot_name) or return nil
90
- slot.other
91
- rescue StandardError => err ; err.polish("#{self.class}.#{meth_name}") rescue nil ; raise ; end
92
- end
93
- end
94
-
95
- def define_inslot_receiver(field)
96
- meth_name = field.basename
97
- slot_name = field.name
98
- type = field.type
99
- define_meta_module_method("receive_#{meth_name}", true) do |stage|
100
- begin
101
- slot = read_attribute(slot_name) or return nil
102
- slot.from(stage)
103
- self
104
- rescue StandardError => err ; err.polish("#{self.class} set slot #{meth_name} to #{stage}") rescue nil ; raise ; end
105
- end
106
- meta_module.module_eval do
107
- alias_method "#{meth_name}=", "receive_#{meth_name}"
108
- end
109
- end
110
-
111
- def define_outslot_receiver(field)
112
- meth_name = field.basename
113
- slot_name = field.name
114
- type = field.type
115
- define_meta_module_method("receive_#{meth_name}", true) do |stage|
116
- begin
117
- slot = read_attribute(slot_name) or return nil
118
- slot.into(stage)
119
- self
120
- rescue StandardError => err ; err.polish("#{self.class} set slot #{meth_name} to #{stage}") rescue nil ; raise ; end
121
- end
122
- meta_module.module_eval do
123
- alias_method "#{meth_name}=", "receive_#{meth_name}"
124
- end
125
- end
126
-
127
- def inslot_fields
128
- fields.select{|_, field| field.is_a?(InputSlotField) }
129
- end
130
-
131
- def inslot_field?(field_name)
132
- fields[field_name].is_a?(InputSlotField)
133
- end
134
- end
135
-
136
- class SlotField < Gorillib::Model::Field
137
- self.visibilities = visibilities.merge(:reader => true, :writer => false, :tester => false)
138
- field :basename, Symbol
139
- field :stage_type, Whatever, :doc => 'type for stages this slot accepts'
140
- class_attribute :slot_type
141
-
142
- def initialize(model, basename, type, options={})
143
- name = "#{basename}_slot"
144
- options[:stage_type] = type
145
- slot_type = self.slot_type
146
- options[:basename] = basename
147
- options[:default] = ->{ slot_type.new(:name => basename, :stage => self) }
148
- super(model, name, slot_type, options)
149
- end
150
- end
151
-
152
- class InputSlotField < SlotField
153
- self.slot_type = Hanuman::InputSlot
154
- def inscribe_methods(model)
155
- model.__send__(:define_slot_reader, self)
156
- model.__send__(:define_inslot_receiver, self)
157
- super
158
- end
159
- end
160
-
161
- class OutputSlotField < SlotField
162
- self.slot_type = Hanuman::OutputSlot
163
- def inscribe_methods(model)
164
- model.__send__(:define_slot_reader, self)
165
- model.__send__(:define_outslot_receiver, self)
166
- super
167
- end
168
- end
169
-
170
- end # Slottable
171
-
172
- module SplatInputs
173
- extend Gorillib::Concern
174
- include Slottable
175
-
176
- included do
177
- collection :splat_inslots, Hanuman::InputSlot, :key_method => :name
178
- end
179
-
180
- def set_input(stage)
181
- slot = Hanuman::InputSlot.new(:name => stage.name, :stage => self, :input => stage)
182
- self.splat_inslots << slot
183
- slot
184
- end
185
-
186
- def has_input?(slot_name)
187
- self.splat_inslots.keys.include?(slot_name)
188
- end
189
-
190
- def inslots
191
- super + splat_inslots.to_a
192
- end
193
- end
194
-
195
- module SplatOutputs
196
- extend Gorillib::Concern
197
- include Slottable
198
-
199
- included do
200
- collection :splat_outslots, Hanuman::OutputSlot, :key_method => :name
201
- end
202
-
203
- def set_output(stage)
204
- slot = Hanuman::OutputSlot.new(
205
- :name => stage.name, :stage => self, :output => stage)
206
- self.outslots << slot
207
- slot
208
- end
209
-
210
- def outputs
211
- outslots.to_a.map{|slot| slot.output }
212
- end
213
-
214
- def into(*others)
215
- others.each{|other| super(other)}
216
- self
217
- end
218
- end
219
-
220
- end
@@ -1,15 +0,0 @@
1
# Wraps a record that could not be processed, together with the error (if
# any) that it triggered.
class BadRecord
  include Gorillib::Model

  # Upper bound on the number of characters kept from the faulty contents.
  MAX_CONTENTS_LENGTH = 1000

  field :contents, Whatever, :doc => "The faulty contents; will be truncated at 1000 characters"
  field :error, Exception, :doc => "Error (optional)"

  # Stringifies the contents and keeps at most MAX_CONTENTS_LENGTH characters
  # before handing them to the inherited receiver. (The previous `[0..1000]`
  # range was inclusive and kept 1001 characters.)
  def receive_contents(contents)
    super contents.to_s[0, MAX_CONTENTS_LENGTH]
  end

  # Builds a BadRecord from the faulty contents and an optional error.
  #
  # @param contents the offending record
  # @param error [Exception, nil] only set on the record when given
  # @return whatever the class-level `receive` returns
  def self.make(contents, error=nil)
    hsh = { :contents => contents }
    hsh[:error] = error if error
    receive(hsh)
  end

  # Instance-level form kept for backward compatibility; delegates to the
  # class-level builder, which is where `receive` is called.
  def make(contents, error=nil)
    self.class.make(contents, error)
  end
end
@@ -1,44 +0,0 @@
1
- module Wukong
2
# Metadata carried alongside an event: when and where it originated, a
# per-origin counter, and the topic it belongs to.
class EventMetadata
  include Gorillib::Model

  field :timestamp, Time, :doc => "time the event originated, assigned by the origin (as anything they like) and unchanged afterwards. A UTC ruby time, serialized as a unix timestamp. Corresponds to Flume's `time` metadata"
  field :origin, String, :doc => "name for the source of this record; in flume, the dispatching `host`. This influences delivery guarantees. A downcased, dasherized, dot-separated identifier."
  # Typed as Integer: the separate Bignum constant is deprecated since Ruby 2.4
  # and removed in Ruby 3.2, where referencing it raises NameError.
  field :nano_ctr, Integer, :doc => "nanosecond timestamp, monotonically-increasing within each origin. The `[origin, nano_ctr]` pair may be considered globally unique. Serialized as whatever flume uses."

  field :topic, Symbol, :doc => "Topic this event belongs to"

  # @return [String] origin and nano_ctr joined with '!'
  def event_id
    [origin, nano_ctr].join('!')
  end
end
16
-
17
# Mixin that lets a model carry a `_metadata` value and include it in its
# wire representation.
module Event
  extend Gorillib::Concern
  include Gorillib::Model

  # Replaces the metadata attached to this event.
  attr_writer :_metadata

  # @return the attached metadata; an empty Hash is created and stored on
  #   first access when nothing has been assigned
  def _metadata
    @_metadata ||= {}
  end

  # @param options passed through to the inherited `to_wire`
  # @return the inherited wire form merged with a :_metadata entry
  def to_wire(options={})
    wire = super(options)
    wire.merge(:_metadata => _metadata)
  end
end
33
-
34
- end
35
-
36
- #
37
- # Example Usage
38
- #
39
- # def process(blob)
40
- # record = JSON.parse(blob)
41
- # metadata = blob._metadata
42
- # { :_id => metadata.event_id, :time => metadata.timestamp, :type => metadata.topic, :data => record }
43
- # # ... now do stuff
44
- # end
@@ -1,55 +0,0 @@
1
- module Wukong
2
# Base class for objects that wire up, set up, drive and stop a dataflow.
class Runner
  include Gorillib::FancyBuilder
  member :flow, Wukong::Dataflow

  # Wires the flow, sets it up, drives it from the given slot, then stops it.
  #
  # @param slot_name handed to #drive_flow
  def run(slot_name)
    wire_flow
    flow.setup
    drive_flow(slot_name)
    flow.stop
  end

  # Convenience entry point: builds a runner for `flow` and runs it.
  #
  # @param flow      the flow to run
  # @param slot_name handed to the instance-level #run
  def self.run(flow, slot_name)
    receive(:flow => flow).run(slot_name)
  end

  # @raise [StandardError] when no flow has been set
  def validate!
    raise StandardError, "flow is missing for #{self}" unless flow.present?
  end

  protected

  # Connect sources, sinks, flows and so forth. On return, the topology of the graph should be in place.
  # Override in your subclass
  #
  # @abstract
  def wire_flow
  end

  # Launch the flow -- sources be each'ing, processors be process'n
  # Override in your subclass
  #
  # The parameter is optional so that this default accepts the slot name that
  # #run always passes (the former zero-argument signature raised
  # ArgumentError) while zero-argument calls keep working.
  #
  # @abstract
  def drive_flow(slot_name=nil)
    puts flow
  end
end
39
-
40
- # Run dataflow in pure ruby
41
# Runner that drives the dataflow directly in the current ruby process.
class LocalRunner < Runner

  protected

  # No wiring is performed here; the commented-out calls are kept as hints.
  def wire_flow
    # flow.set_output sink(:test_sink)
    # flow.set_output sinks.to_a.last
  end

  # Checks that a flow is present, then drives it from the given slot.
  #
  # @param slot_name handed to `flow.drive`
  def drive_flow(slot_name)
    validate!
    flow.drive(slot_name)
  end
end
55
- end
@@ -1,3 +0,0 @@
1
# Namespace placeholder: opens the Wukong module without adding anything.
module Wukong; end
@@ -1,48 +0,0 @@
1
- module Wukong
2
-
3
- #
4
- # Holds graphs, supplies `processor` and similar stage template methods
5
- #
6
# Holds named dataflows and workflows and supplies `processor` and similar
# stage template methods. Extend a module with it to make that module a
# universe.
module Universe
  # Looks up the camelized `klass_name` directly inside `namespace`, creating
  # it as a subclass of `superklass` when absent.
  #
  # The lookup passes `false` for `inherit`: the default also searches
  # ancestors and Object, so a name such as :string would resolve to ::String
  # and never create namespace::String.
  #
  # @param superklass [Class] parent for a newly created class
  # @param klass_name [#to_s] underscored or camelized class name
  # @param namespace  [Module] where the constant lives
  # @yield evaluated as the class body, only when the class is created
  # @return [Class] the existing or newly created class
  def find_or_create_class(superklass, klass_name, namespace, &block)
    klass_name = Gorillib::Inflector.camelize(klass_name.to_s).to_sym
    if namespace.const_defined?(klass_name, false)
      namespace.const_get(klass_name, false)
    else
      namespace.const_set(klass_name, Class.new(superklass, &block))
    end
  end

  # Finds or creates a Wukong::Processor subclass under Wukong::Widget. A newly
  # created class registers itself under `processor_name`; the block, if any,
  # is class_eval'd into the class on every call.
  #
  # @return [Class] the processor class
  def processor(processor_name, &block)
    klass = find_or_create_class(Wukong::Processor, processor_name, Wukong::Widget) do
      register_processor(processor_name)
    end
    klass.class_eval(&block) if block_given?
    klass
  end

  # Finds or creates the dataflow registered under `name` and applies `attrs`
  # (plus the symbolized :name) and the block to it via `receive!`.
  #
  # @param attrs [Hash] left untouched; a merged copy is passed on
  # @return the dataflow
  def dataflow(name, attrs={}, &block)
    name = name.to_sym
    dataflow = (@dataflows[name] ||= Dataflow.new(:name => name))
    dataflow.receive!(attrs.merge(:name => name), &block)
    dataflow
  end

  # Finds or creates the workflow registered under `name` and applies `attrs`
  # (plus the symbolized :name) and the block to it via `receive!`.
  #
  # @param attrs [Hash] left untouched; a merged copy is passed on
  # @return the workflow
  def workflow(name, attrs={}, &block)
    name = name.to_sym
    workflow = (@workflows[name] ||= Workflow.new(:name => name))
    workflow.receive!(attrs.merge(:name => name), &block)
    workflow
  end

  # Gives each extending object its own empty dataflow and workflow registries.
  def self.extended(base)
    base.instance_variable_set(:@dataflows, {})
    base.instance_variable_set(:@workflows, {})
  end
end
45
-
46
- # Wukong can serve as a universe
47
- extend Universe
48
- end
@@ -1,81 +0,0 @@
1
- module Wukong
2
- module Widget
3
-
4
# Processor that emits a record only when `select?` accepts it. This class
# itself does not define `select?`.
class Filter < Wukong::Processor
  # Emits the arguments when `select?` is truthy for them; otherwise returns nil.
  def process(*args)
    return unless select?(*args)
    emit(*args)
  end

  # @return [Boolean] the negation of `select?`
  def reject?(*args)
    !select?(*args)
  end
end
8
-
9
# Filter expressed in terms of `reject?`: records are emitted unless rejected.
# As defined here, `reject?` is always true, so nothing is emitted until a
# subclass overrides it.
class Rejecter < Filter
  # Emits the arguments unless `reject?` is truthy for them.
  def process(*args)
    emit(*args) unless reject?(*args)
  end

  # @return [Boolean] the negation of `reject?`
  def select?(*args)
    !reject?(*args)
  end

  # @return [true] always
  def reject?(*args)
    true
  end
end
14
-
15
# Filter that selects every record.
class All < Filter
  # @return [true] regardless of the arguments
  def select?(*_args)
    true
  end
end
18
-
19
# Rejecter that rejects every record.
class None < Rejecter
  # @return [true] regardless of the arguments
  def reject?(*_args)
    true
  end
end
22
-
23
- # Selects only records matching this regexp
24
# Filter that selects the strings matched by its `pattern`.
class RegexpFilter < Filter
  field :pattern, Regexp, :doc => 'strings matching this regular expression will be selected'

  # @param str the value to test
  # @return [MatchData, nil] the result of matching `pattern` against `str`
  def select?(str)
    pattern.match(str)
  end

  class << self
    # Same as the inherited `make`, with `pattern` added to the attributes.
    def make(workflow, pattern, attrs={}, &block)
      with_pattern = attrs.merge(:pattern => pattern)
      super(workflow, with_pattern, &block)
    end
  end

  register_processor(:re)
end
35
-
36
# Rejecter that rejects the strings matched by its `pattern`.
class RegexpRejecter < Rejecter
  field :pattern, Regexp, :doc => 'strings matching this regular expression will be rejected'

  # @param str the value to test
  # @return [MatchData, nil] the result of matching `pattern` against `str`
  def reject?(str)
    pattern.match(str)
  end

  class << self
    # Same as the inherited `make`, with `pattern` added to the attributes.
    def make(workflow, pattern, attrs={}, &block)
      with_pattern = attrs.merge(:pattern => pattern)
      super(workflow, with_pattern, &block)
    end
  end

  register_processor(:not_re)
end
47
-
48
# Filter whose `select?` is supplied by the caller as a proc or a block.
class ProcFilter < Filter
  # NOTE(review): does not call super -- confirm Wukong::Processor needs no
  # initialization of its own.
  #
  # @param [Proc] prc use for body of `select?` method
  # @yield ...or supply a block directly
  # @raise [RuntimeError] when neither a proc nor a block is given
  def initialize(prc=nil, &block)
    prc ||= block || raise("Please supply a proc or a block to #{self.class}.new")
    define_singleton_method(:select?, prc)
  end
end
56
-
57
# Rejecter whose `reject?` is supplied by the caller as a proc or a block.
class ProcRejecter < Rejecter
  # @param [Proc] prc becomes the body of this instance's `reject?`
  # @yield alternatively, give the body as a block
  # @raise [RuntimeError] when neither a proc nor a block is given
  def initialize(prc=nil, &block)
    body = prc || block
    raise "Please supply a proc or a block to #{self.class}.new" unless body
    define_singleton_method(:reject?, body)
  end
end
65
-
66
# Rejecter that starts rejecting once `count` reaches `max_records`.
#
# NOTE(review): `count` is not defined in this class and the CountingProcessor
# include is commented out -- confirm that an ancestor provides it.
class Limit < Rejecter
  # include CountingProcessor
  field :max_records, Integer, :doc => 'maximum records to allow', :writer => true

  # @return [Boolean] true once `count` is at least `max_records`
  def reject?(*)
    count >= max_records
  end

  class << self
    # Same as the inherited `make`, with `max` passed on as :max_records.
    def make(workflow, max, attrs={}, &block)
      with_limit = attrs.merge(:max_records => max)
      super(workflow, with_limit, &block)
    end
  end

  register_processor
end
79
-
80
- end
81
- end