wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,161 @@
1
module Hanuman
  module Graphvizzer

    # Width budget for the first column of a rendered line: `Item#line` and
    # `Edge#to_s` left-justify their first field to COL_1_WIDTH minus the
    # length of the current indent.
    COL_1_WIDTH = 47

    # Base class for everything that renders itself as text in a dot file.
    # Supplies indentation and small string-formatting helpers.
    class Item
      include Gorillib::Builder

      field :name,  Symbol
      field :label, String, :default => ->{ name }
      field :owner, Item

      # @param adj [Integer] extra characters to add to the indent
      # @return [String] `depth + adj` repetitions of the indent string
      def indent(adj=0)
        " " * (depth + adj)
      end

      # @return [Integer] one more than the owner's depth.
      #   Calls `owner.depth` unconditionally, so an item without an owner
      #   must override this (as Universe does).
      def depth
        owner.depth + 1
      end

      # Wraps `str` in double quotes, unless its string form already contains
      # a double quote, in which case `str` is returned untouched.
      def quote(str)
        return str if str.to_s.include?('"')
        %Q{"#{str}"}
      end

      # @return [String] `attr=val`
      def attrib(attr, val)
        "#{attr}=#{val}"
      end

      # @return [String] indented opening line of a braced section
      def brace(str)
        "#{indent}#{str} {"
      end

      # @return [String] indented closing brace
      def close_brace
        "#{indent}}"
      end

      # Renders one statement line.
      #
      # @param str   [String] the statement's leading text
      # @param attrs [Hash]   attributes rendered as `[ k=v, k=v ]`; when empty
      #   no attribute list is emitted and `str` is not padded
      # @param term  [String] terminator appended to the line
      # @return [String]
      def line(str, attrs={}, term=';')
        lead = indent
        return [lead, str, '', term].join if attrs.empty?
        padded = "%-#{COL_1_WIDTH - lead.length}s" % str
        pairs  = attrs.map{|attr, val| attrib(attr, val) }.join(", ")
        [lead, padded, "\t[ #{pairs} ]", term].join
      end
    end

    # A cluster subgraph holding child items (nodes and nested graphs) and
    # the edges among them.
    class Graph < Item
      field :items, Array, :default => []
      field :edges, Array, :default => []

      # Renders the subgraph, its items and then its edges, for example:
      #
      #     subgraph "cluster_crust" {
      #       label="crust";
      #       "cherry_pie.crust.small_bowl" [ shape=Mrecord, label="{small\nbowl}" ];
      #       "cherry_pie.crust.flour"      [ shape=Mrecord, label="{flour}" ];
      #       "cherry_pie.crust.small_bowl" -> "cherry_pie.crust.add_to_4";
      #       "cherry_pie.crust.flour"      -> "cherry_pie.crust.add_to_4";
      #     }
      #
      # @return [String]
      def to_s
        [
          brace("subgraph #{quote("cluster_#{name}")}"),
          line(attrib(" label", quote(label))),
          *items.map{|item| item.to_s },
          *edges.map{|edge| edge.to_s },
          close_brace,
        ].join("\n")
      end

      # Adds a nested Graph owned by this one.
      #
      # @param name  [Symbol] name of the new graph
      # @param attrs [Hash]   further attributes for Graph.new
      # @yield [Graph] the new graph, if a block is given
      # @return [Graph] the new graph
      def graph(name, attrs={})
        child = Graph.new(attrs.merge(:name => name, :owner => self))
        items << child
        yield(child) if block_given?
        child
      end

      # Adds a Node owned by this graph.
      #
      # @param name  [Symbol] name of the new node
      # @param attrs [Hash]   further attributes for Node.new
      # @yield [Node] the new node, if a block is given
      # @return [Node] the new node
      def node(name, attrs={})
        child = Node.new(attrs.merge(:name => name, :owner => self))
        items << child
        yield(child) if block_given?
        child
      end

      # Adds an Edge owned by this graph. The edge is given this graph's own
      # name.
      # NOTE(review): :from_slot and :into_slot are passed along, but Edge
      # declares no such fields here -- confirm how the builder treats them.
      #
      # @yield [Edge] the new edge, if a block is given
      # @return [Edge] the new edge
      def edge(from, into, from_slot=nil, into_slot=nil)
        link = Edge.new(
          :name      => name,      :owner     => self,
          :from      => from,      :into      => into,
          :from_slot => from_slot, :into_slot => into_slot)
        edges << link
        yield(link) if block_given?
        link
      end
    end

    # The outermost graph: renders as a `digraph` and can write itself to disk.
    class Universe < Graph
      field :orient, Symbol, :doc => 'one of :TB, :BT, :LR, :RL', :default => :TB
      field :engine, Symbol, :default => :dot

      # Renders the digraph and its items (this method does not emit `edges`):
      #
      #     digraph Wukong {
      #       rankdir = TB;
      #       subgraph "cluster_cherry_pie" {
      #         # ...
      #       }
      #     }
      #
      # @return [String]
      def to_s
        [
          brace("digraph #{name}"),
          line(" rankdir = #{orient}"),
          *items.map{|item| item.to_s },
          close_brace,
        ].join("\n")
      end

      # The universe is the root, so depth bottoms out here.
      def depth
        0
      end

      # Writes the dot source to `<path>.dot`; when `type` is given, also runs
      # the layout engine to produce `<path>.<type>`.
      #
      # The engine is invoked with an argument vector and its stdout is
      # redirected by Ruby rather than by a shell, so paths containing spaces
      # or shell metacharacters are passed through verbatim.
      #
      # @param path [String] output path without extension
      # @param type [String, Symbol, nil] graphviz output format (passed as -T<type>)
      # @return [Boolean, nil] result of Kernel#system, or nil when no type is given
      def save(path, type=nil)
        dot_path = "#{path}.dot"
        File.open(dot_path, "w") do |f|
          f.puts self.to_s
        end
        return unless type
        system(engine.to_s, "-T#{type}", dot_path, :out => "#{path}.#{type}")
      end
    end

    # A single node, drawn by default as an Mrecord with input and output
    # slot ports.
    class Node < Item
      field :inslots,  Array,  :default => []
      field :outslots, Array,  :default => []
      field :shape,    Symbol, :default => :Mrecord

      # Renders the node statement, for example:
      #
      #     "cherry_pie.crust.small_bowl" [ shape=Mrecord, label="{{<in>sb}|small\nbowl}" ];
      #
      # @return [String]
      def to_s
        line(quote(name), :shape => shape, :label => quote(structured_label))
      end

      # Shortens a slot name to the initials of its segments.
      #
      # @return [String]
      def abbreviate(word)
        segments = word.to_s.
          gsub(/-\d+$/,'').           # remove a trailing index (-0...)
          split(/[\W_]+/)             # split on runs of non-word characters or underscores
        segments.reject(&:empty?).map{|seg| seg[0] }.join
      end

      # @return [String] record ports for the input slots, `|`-separated
      def inslots_str
        inslots.map{|slot| "<#{slot}>#{abbreviate(slot)}"}.join("|")
      end

      # @return [String] record ports for the output slots (port names are
      #   prefixed with an underscore), `|`-separated
      def outslots_str
        outslots.map{|slot| "<_#{slot}>#{abbreviate(slot)}"}.join("|")
      end

      # The label with a trailing `_<digits>` removed and each run of
      # underscores or dots replaced by a literal backslash-n.
      #
      # @return [String]
      def label
        super.to_s.gsub(/_\d+$/, '').gsub(/[_\.]+/, "\\n")
      end

      # For record-like shapes, wraps the label between an input-port cell and
      # an output-port cell; placeholder ports `<i>` and `<_o>` are used when
      # the node has no slots. Other shapes get the plain label.
      #
      # @return [String]
      def structured_label
        return label unless shape =~ /record/
        in_ports  = inslots.empty?  ? "<i>"  : inslots_str
        out_ports = outslots.empty? ? "<_o>" : outslots_str
        "{{#{in_ports}}|#{label}|{#{out_ports}}}"
      end
    end

    # A directed edge between two endpoints, rendered as given.
    class Edge < Item
      field :from, Whatever
      field :into, Whatever

      # Renders the edge statement, for example:
      #
      #     "cherry_pie.crust.small_bowl" -> "cherry_pie.crust.add_to_4";
      #
      # @return [String]
      def to_s
        lead  = indent
        width = COL_1_WIDTH - lead.length
        "#{lead}%-#{width}s\t-> %-s;" % [ from, into ]
      end
    end

  end
end
@@ -0,0 +1,97 @@
1
+ require 'hanuman/graphvizzer/gv_models'
2
+
3
module Hanuman

  # Graphviz hooks shared by every stage.
  Stage.class_eval do
    class_attribute :draw_shape
    self.draw_shape = :record

    # Identifier used for this stage in the dot output: its graph_id.
    def graphviz_id
      graph_id
    end

    # Edge endpoint for links arriving at this stage: the node's "i" port.
    # Also emits self and its class ancestry through `warn`.
    # NOTE(review): the warn looks like leftover tracing -- confirm before removing.
    def gv_into_label
      warn [self, self.class.ancestors]
      %Q{"#{graphviz_id}":"i"}
    end

    # Edge endpoint for links leaving this stage: the node's "_o" port.
    # Also emits self and its class ancestry through `warn` (see gv_into_label).
    def gv_from_label
      warn [self, self.class.ancestors]
      %Q{"#{graphviz_id}":"_o"}
    end

    # Adds a node for this stage to the given graphviz graph and returns
    # whatever `gv.node` returns.
    def to_graphviz(gv)
      gv.node(graphviz_id, :label => name, :shape => draw_shape)
    end
  end

  Product.class_eval do
    self.draw_shape = :Mrecord
  end

  module ::Wukong::Universe
    # Builds a Graphvizzer::Universe named after this universe and asks every
    # workflow, then every dataflow, to draw itself into it.
    def to_graphviz
      universe = Hanuman::Graphvizzer::Universe.new(:name => self.name)
      @workflows.each_value{|workflow| workflow.to_graphviz(universe) }
      @dataflows.each_value{|dataflow| dataflow.to_graphviz(universe) }
      universe
    end
  end

  Graph.class_eval do
    self.draw_shape = :record

    # Draws this graph as a subgraph of `gv`: wired input slots first, then
    # the stages, then wired output slots, and finally the edges.
    def to_graphviz(gv)
      gv.graph(graphviz_id, :label => name) do |subgraph|
        inslots.each_value do |slot|
          slot.to_graphviz(subgraph) if slot.wired?
        end

        stages.each_value do |stage|
          stage.to_graphviz(subgraph)
        end

        outslots.each_value do |slot|
          slot.to_graphviz(subgraph) if slot.wired?
        end

        edges.each_value do |edge|
          subgraph.edge(edge[:from].gv_from_label, edge[:into].gv_into_label)
        end
      end
    end
  end

  module Hanuman::Slottable
    # Draws the node via super, then hands it the names of this stage's
    # input and output slots. Returns the node.
    def to_graphviz(gv)
      node = super
      node.receive!(
        :inslots  => inslots.to_a.map(&:name),
        :outslots => outslots.to_a.map(&:name),
      )
      node
    end
  end

  module InputSlotted
    # Edge endpoint for incoming links: the node's "i" port.
    def gv_into_label
      %Q{"#{graphviz_id}":"i"}
    end
  end

  module OutputSlotted
    # Edge endpoint for outgoing links: the node's "_o" port.
    def gv_from_label
      %Q{"#{graphviz_id}":"_o"}
    end
  end

  Slot.class_eval do
    # Draws the slot as an Mrecord node.
    def to_graphviz(gv)
      gv.node(graphviz_id, :label => name, :shape => :Mrecord)
    end

    # A slot whose stage is a graph or a universe uses its own graph_id;
    # otherwise it uses the graph_id of its stage.
    def graphviz_id
      on_container = stage.is_a?(Hanuman::Graph) || stage.is_a?(Wukong::Universe)
      on_container ? graph_id : stage.graph_id
    end
  end

  InputSlot.class_eval do
    # Edge endpoint for incoming links: the port named after this slot.
    def gv_into_label
      %Q{"#{graphviz_id}":"#{name}"}
    end

    # Draws the slot node via super and registers this slot as its only inslot.
    def to_graphviz(gv)
      node = super
      node.receive!(:inslots => [name])
      node
    end
  end

  OutputSlot.class_eval do
    # Edge endpoint for outgoing links: the port named after this slot,
    # prefixed with an underscore.
    def gv_from_label
      %Q{"#{graphviz_id}":"_#{name}"}
    end

    # Draws the slot node via super and registers this slot as its only outslot.
    def to_graphviz(gv)
      node = super
      node.receive!(:outslots => [name])
      node
    end
  end

end
@@ -0,0 +1,35 @@
1
module Hanuman
  # Creates link objects by dispatching on a label to a registered callable.
  class LinkFactory

    # label => callable taking (from_stage, *into_stage) and returning a link.
    # DirectedLink is resolved only when the lambda is invoked.
    Registry = {
      simple: ->(from_stage, *into_stage) { DirectedLink.new(from_stage, *into_stage) }
    }

    class << self

      # Builds a link using the factory registered under +label+.
      #
      # @param label the key the factory was registered under
      # @param from_stage the source end of the link
      # @param into_stage the destination end(s), passed through to the factory
      # @return whatever the registered factory returns
      # @raise [ArgumentError] if no factory is registered under +label+
      def connect(label, from_stage, *into_stage)
        factory = Registry.fetch(label) do
          raise ArgumentError,
                "No link factory registered for #{label.inspect} (known: #{Registry.keys.inspect})"
        end
        factory.call(from_stage, *into_stage)
      end

      # Registers (or replaces) the factory used for +label+.
      #
      # @param label the key to register under
      # @param factory_method a callable accepting (from_stage, *into_stage)
      # @return the callable that was registered
      def register(label, factory_method)
        Registry[label] = factory_method
      end

    end
  end

  # A one-way connection from one endpoint to another.
  class DirectedLink

    attr_accessor :from, :into

    # @param from the source endpoint
    # @param into the destination endpoint
    def initialize(from, into)
      @from, @into = from, into
    end

    # @return [String] e.g. "#<Hanuman::DirectedLink(a -> b)>"
    def to_s
      "#<#{self.class}(#{from.to_s} -> #{into.to_s})>"
    end

  end
end
@@ -0,0 +1,46 @@
1
module Hanuman
  # Catalogue of builders keyed by label.
  #
  # All instances read and write the single REGISTRY constant, so every
  # Registry object sees the same entries.
  class Registry

    REGISTRY = {} unless defined? REGISTRY

    # Registers +builder+ under +label+, or merges it into the existing entry.
    #
    # @return [Boolean] true when the entry was created, otherwise the result
    #   of #update
    def create_or_update(label, builder)
      create(label, builder) || update(label, builder)
    end

    # Stores +builder+ under +label+ unless the label is already taken.
    #
    # @return [Boolean] true if stored, false if +label+ was already registered
    def create(label, builder)
      return false if registered?(label)
      REGISTRY[label] = builder
      true
    end

    # Merges +new_definition+ into the entry stored under +label+ by calling
    # merge! on the stored entry.
    #
    # @return [Boolean] true if merged, false if +label+ is not registered
    def update(label, new_definition)
      return false unless registered?(label)
      REGISTRY[label].merge!(new_definition)
      true
    end

    # @return [Boolean] whether an entry exists under +label+
    def registered?(label)
      REGISTRY.key?(label)
    end

    # @return a clone of the entry stored under +label+, or nil when the
    #   lookup or the clone raises a StandardError
    def retrieve(label)
      REGISTRY[label].clone
    rescue StandardError
      nil
    end

    # Defines one singleton method on +graph_instance+ per registered label.
    # Calling that method retrieves a fresh copy of the builder from
    # Hanuman::GlobalRegistry, passes it through the graph's
    # handle_dsl_arguments_for, and stores the result in the graph's stages
    # under the resulting builder's label.
    def decorate_with_registry(graph_instance)
      REGISTRY.each_key do |label|
        graph_instance.define_singleton_method(label) do |*args, &blk|
          builder = Hanuman::GlobalRegistry.retrieve(label)
          builder = handle_dsl_arguments_for(builder, *args, &blk)
          stages[builder.label] = builder
        end
      end
    end

    # @return [Hash] a shallow copy of the registry contents
    def show
      REGISTRY.dup
    end

    # Removes every entry.
    def clear!
      REGISTRY.clear
    end

    # Merges +contents+ (label => builder) straight into the registry.
    def merge!(contents)
      REGISTRY.merge!(contents)
    end
  end

  GlobalRegistry = Registry.new unless defined? GlobalRegistry
end
@@ -1,51 +1,95 @@
1
1
  module Hanuman
2
- class Stage
3
- include Gorillib::Builder
4
- alias_method :configure, :receive!
5
-
6
- magic :name, Symbol, :doc => 'name for this stage; should be unique among other stages on its containing graph', :tester => true
7
- field :owner, Whatever, :doc => 'the graph this stage sits in'
8
- magic :doc, String, :doc => 'freeform description of this stage type'
2
+ module StageClassMethods
9
3
 
10
- # @returns the stage, namespaced by the graph that owns it
11
- def fullname
12
- [owner.try(:fullname), name].compact.join('.')
4
# @return [Symbol] this class's name with the enclosing modules removed,
#   underscored, as a symbol
def label
  to_s.demodulize.underscore.to_sym
end
5
+
6
# Returns the memoized builder, creating a StageBuilder labelled with this
# class's label (and installing it via set_builder) on first use.
def builder
  @builder || set_builder(StageBuilder.new(label: label))
end
14
-
15
- def self.handle
16
- Gorillib::Inflector.underscore(Gorillib::Inflector.demodulize(self.name))
10
+
11
# Installs +builder+ as this class's builder and points the builder back at
# this class through for_class=.
#
# A nil +builder+ is ignored: the currently installed builder (possibly nil)
# is returned unchanged instead of failing on nil.for_class=.
#
# @param builder the builder to install, or nil for a no-op
# @return the builder installed after the call
def set_builder(builder)
  return @builder if builder.nil?
  @builder = builder
  builder.for_class = self
  @builder
end
16
+
17
# Registers this class's builder in Hanuman::GlobalRegistry.
#
# @param new_label label to register under; defaults to this class's label
# @return self
def register(new_label = nil)
  registered_as = new_label || label
  stage_builder = builder
  stage_builder.label = registered_as
  Hanuman::GlobalRegistry.create_or_update(registered_as, stage_builder)
  self
end
22
+ end
23
+
24
+ class Stage
25
+ include Gorillib::Model
26
+ extend StageClassMethods
18
27
 
19
- #
20
- # Methods
21
- #
28
+ field :label, Symbol
29
+ end
22
30
 
23
- # Called after the graph is constructed, before the flow is run
24
- def setup
31
+ class StageBuilder
32
+ include Gorillib::Model
33
+
34
+ field :args, Hash, :default => {}
35
+ field :for_class, Class
36
+ field :label, Symbol
37
+ field :links, Array, :default => []
38
+
39
# Resolves the class for this builder (the existing for_class, or one
# created by define_class), evaluates the optional block in that class's
# context, and registers the class.
#
# @param args forwarded to define_class when no class exists yet
# @return the result of the class's register call
def define(*args, &blk)
  target = for_class || define_class(label, *args)
  target.class_eval(&blk) if blk
  target.register
end
26
44
 
27
- # Called to signal the flow should stop. Close any open connections, flush
28
- # buffers, stop supervised projects, etc.
29
- def stop
45
# Instantiates the stage class from this builder's serialized attributes,
# with +options+ taking precedence on conflicting keys.
#
# @param options [Hash] overrides merged on top of the serialized attributes
# @return whatever for_class.receive returns
def build(options = {})
  target = for_class
  target.receive(serialize.merge(options))
end
48
+
49
# Folds attributes that have no declared field into +args+ (in place).
#
# @param attrs [Hash] the extra attributes
# @return the updated +args+ hash
def handle_extra_attributes(attrs)
  args.update(attrs)
end
52
+
53
# Updates this builder in place from another builder or from a plain hash.
#
# Anything that responds to +attributes+ contributes that result; anything
# else is passed to receive! as-is. Errors raised by +attributes+ now
# propagate instead of being silently replaced by the object itself.
#
# @param other_builder a builder-like object or a Hash of attributes
# @return self
def merge!(other_builder = {})
  attrs = other_builder.respond_to?(:attributes) ? other_builder.attributes : other_builder
  receive!(attrs)
  self
end
58
+
59
# @return the module under which classes for this builder are defined;
#   always Hanuman::Stage here, whatever +args+ are given
def namespace(*args)
  Hanuman::Stage
end
60
+
61
# Finds or creates the class for +name+ directly inside the namespace and
# makes this builder its builder.
#
# The lookup is restricted to the namespace itself (inherit = false), so a
# name such as :string creates <namespace>::String instead of resolving to
# the top-level ::String through the ancestor chain.
#
# @param name the stage name; its camelized form is the constant name
# @param args forwarded to namespace
# @return [Class] the existing or newly created class
def define_class(name, *args)
  scope      = namespace(*args)
  const_name = name.to_s.camelize
  klass =
    if scope.const_defined?(const_name, false)
      scope.const_get(const_name, false)
    else
      # new classes inherit from the namespace class itself
      scope.const_set(const_name, Class.new(scope))
    end
  klass.set_builder(self)
  klass
end
67
+
68
# @param direction ignored by this implementation
# @return this builder's label, used as its endpoint name in links
def linkable_name(direction)
  label
end
31
69
 
32
- def lookup(stage)
33
- owner.lookup(stage)
70
# Creates a link through Hanuman::LinkFactory and appends it to +links+.
#
# @param level the link-factory label
# @param from the source endpoint
# @param into the destination endpoint
# @return the +links+ collection
def add_link(level, from, into)
  links.push(Hanuman::LinkFactory.connect(level, from, into))
end
35
73
 
36
- #
37
- # Graph connections
38
- #
74
# Records a :simple link from this builder to +other_stage+ and returns
# +other_stage+ so that calls can be chained.
#
# @param other_stage the builder on the receiving end
# @return +other_stage+
def into(other_stage)
  source_name = linkable_name(:in)
  target_name = other_stage.linkable_name(:out)
  add_link(:simple, source_name, target_name)
  other_stage
end
78
+ alias_method :|, :into
39
79
 
40
- def notify(msg)
41
- true
80
# Flattens this builder into a single hash: the attributes minus :links and
# :for_class, with the contents of :args merged in at the top level.
#
# @return [Hash]
def serialize
  fields     = attributes
  extra_args = fields.delete(:args)
  [:links, :for_class].each { |key| fields.delete(key) }
  fields.merge(extra_args)
end
43
86
 
44
- def report
45
- self.attributes
87
# This is a hacky method to clone a Stage ; probably could be merged into serialize?
#
# Builds a new builder of the same class from dup'd copies of the serialized
# keys and values (falling back to the original object when dup raises),
# dup'd links, and the same for_class.
def clone
  copy_of      = lambda { |obj| obj.dup rescue obj }
  cloned_attrs = Hash[ serialize.map { |key, val| [copy_of.call(key), copy_of.call(val)] } ]
  cloned_links = links.map(&:dup)
  self.class.receive(cloned_attrs.merge(links: cloned_links).merge(for_class: for_class))
end
47
93
 
48
- def to_key() name ; end
49
- def key_method() :name ; end
50
94
  end
51
95
  end