wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476)
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,58 +1,16 @@
1
1
  module Wukong
2
- class Sink < Wukong::Processor
3
-
4
- class NullSink < Wukong::Sink
2
+ class Sink < Processor
3
+
4
+ class Stdout < Sink
5
5
  def process(record)
6
- true # do nothing
7
- end
8
- end
9
-
10
- # Write all lines to given file
11
- class IO < Wukong::Sink
12
- def process(record)
13
- file.puts(record)
14
- end
15
- end
16
-
17
- class FileSink < Wukong::Sink::IO
18
- field :filename, Pathname, :doc => "Filename to write"
19
- attr_reader :file
20
-
21
- def self.make(workflow, filename, stage_name=nil, attrs={})
22
- super(workflow, attrs.merge(:filename => filename, :name => stage_name))
23
- end
24
-
25
- def setup
26
- super
27
- filename.dirname.mkpath
28
- @file = File.open(filename, "w")
29
- end
30
-
31
- def stop
32
- @file.close if @file
33
- end
34
-
35
- register_processor
36
- end
37
-
38
- # Writes all lines to $stdout
39
- class Stdout < Wukong::Sink::IO
40
- def file() $stdout ; end
41
- register_processor
42
- end
43
-
44
- # Writes all lines to $stderr
45
- class Stderr < Wukong::Sink::IO
46
- def file() $stderr ; end
47
- register_processor
48
- end
49
-
50
- class ArraySink < Wukong::Sink
51
- field :records, Array, :default => [], :writer => :protected
52
-
53
- def process(record)
54
- self.records << record
6
+ begin
7
+ $stdout.puts record
8
+ rescue Errno::EPIPE => e
9
+ exit(2)
10
+ end
55
11
  end
12
+ register
56
13
  end
14
+
57
15
  end
58
16
  end
@@ -1,120 +1,14 @@
1
1
  module Wukong
2
- class Source < Hanuman::Action
3
- include Hanuman::IsOwnOutputSlot
4
- def self.register_source(name=nil, &block)
5
- register_action(name, &block)
6
- end
2
+ class Source < Processor
7
3
 
8
- def drive
9
- each do |record|
10
- output.process(record)
11
- end
12
- end
13
-
14
- def new_string_event string
15
- metadata_hash = Hash.new
16
- string.define_singleton_method(:_metadata) do
17
- metadata_hash
18
- end
19
- string
20
- end
21
-
22
- class Iter < Source
23
- # the enumerable object to delegate
24
- attr_reader :obj
25
-
26
- def initialize(obj)
27
- @obj = obj
28
- end
29
- def each(&block)
30
- obj.each(&block)
31
- end
32
- end
33
-
34
- class IO < Source
35
- attr_reader :file
36
-
37
- def each(&block)
38
- file.each do |line|
39
- yield line.chomp
4
+ class Stdin < Source
5
+ def process
6
+ while line = $stdin.readline.chomp! rescue nil
7
+ yield line
40
8
  end
41
9
  end
42
-
43
- def stop
44
- file.close if file
45
- end
46
- end
47
-
48
- # emits each line from $stdin
49
- class Stdin < Wukong::Source::IO
50
- def setup
51
- super
52
- @file = $stdin
53
- end
54
- register_source
55
- end
56
-
57
- class FileSource < Wukong::Source::IO
58
- field :filename, Pathname, :doc => "Filename to read from"
59
-
60
- def self.make(workflow, filename, stage_name=nil, attrs={})
61
- super(workflow, attrs.merge(:filename => filename, :name => stage_name))
62
- end
63
-
64
- def setup
65
- super
66
- @file = File.open(filename)
67
- end
68
-
69
- register_source
70
- end
71
-
72
- module CappedGenerator
73
- extend Gorillib::Concern
74
- included do
75
- attr_reader :num
76
- field :size, Integer, :default => 2**63, :doc => "Number of items to generate", :writer => true
77
- end
78
-
79
- def setup
80
- super
81
- @num = 0
82
- end
83
-
84
- def max
85
- size
86
- end
87
-
88
- def next_item
89
- end
90
-
91
- def each
92
- loop do
93
- break if @num > max
94
- yield next_item
95
- @num += 1
96
- end
97
- end
98
- end
99
-
100
- class Integers < Wukong::Source
101
- register_source :integers
102
- include CappedGenerator
103
- field :init, Integer, :default => 0, :doc => "Initial offset", :writer => true
104
-
105
- def max
106
- init + size - 1
107
- end
108
-
109
- def next_item
110
- @num
111
- end
112
-
113
- def self.make(dataflow, size=nil, attrs={})
114
- attrs[:size] = size if not size.nil?
115
- super(dataflow, attrs)
116
- end
117
- end
10
+ register
11
+ end
118
12
 
119
13
  end
120
14
  end
@@ -0,0 +1,46 @@
1
+ module Wukong
2
+
3
+ # :nodoc:
4
+ #
5
+ # This code is gross and nasty.
6
+ module DynamicGet
7
+
8
+ # :nodoc:
9
+ def self.included klass
10
+ klass.send(:field, :separator, String, :default => "\t")
11
+ end
12
+
13
+ # :nodoc:
14
+ def get field, obj
15
+ return obj unless field
16
+ case
17
+ when field.to_s.to_i > 0 && obj.is_a?(String)
18
+ obj.split(separator)[field.to_s.to_i - 1]
19
+ when field.to_s.to_i > 0
20
+ obj[field.to_s.to_i - 1]
21
+ when field.to_s.to_i == 0 && obj.is_a?(String) && obj =~ /^\s*\{/
22
+ begin
23
+ get_nested(field, MultiJson.load(obj))
24
+ rescue MultiJson::DecodeError => e
25
+ end
26
+ when field.to_s.to_i == 0 && (!field.to_s.include?('.')) && obj.respond_to?(field.to_s)
27
+ obj.send(field.to_s)
28
+ when field.to_s.to_i == 0 && obj.respond_to?(:[])
29
+ get_nested(field, obj)
30
+ else obj
31
+ end
32
+ end
33
+
34
+ # :nodoc:
35
+ def get_nested fields, obj
36
+ parts = fields.to_s.split('.')
37
+ field = parts.shift
38
+ return unless field
39
+ if slice = obj[field]
40
+ return slice if parts.empty?
41
+ get_nested(parts.join('.'), slice)
42
+ end
43
+ end
44
+
45
+ end
46
+ end
@@ -0,0 +1,6 @@
1
+ require 'wukong/widget/source'
2
+ require 'wukong/widget/sink'
3
+ require 'wukong/widget/processors'
4
+ require 'wukong/widget/reducers'
5
+ require 'wukong/widget/serializers'
6
+ require 'wukong/widget/filters'
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+ # require 'wukong'
3
+
4
+ # describe_example_script :fibonacci_series, 'dataflow/fibonacci_series.rb', examples_spec: true do
5
+ # subject{ Wukong.chain(:fibbonaci_series) }
6
+
7
+ # it 'generates a fibonacci sequence' do
8
+ # subject.ticker.qty(12)
9
+ # # subject.output > subject.array_sink(name: :numbers)
10
+ # # subject.setup
11
+ # # subject.ticker.drive
12
+ # #
13
+ # # subject.numbers.records.should == [0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89]
14
+ # end
15
+
16
+ # it_generates_graphviz{|gv_filename| puts File.read(gv_filename) }
17
+
18
+ # end
@@ -1,13 +1,14 @@
1
1
  require 'spec_helper'
2
- require 'wukong'
2
+ # require 'wukong'
3
3
 
4
- describe_example_script(:parse_apache_logs, 'dataflow/parse_apache_logs.rb') do
5
- it 'runs' do
6
- out, err = Gorillib::TestHelpers.capture_output do
7
- Wukong::LocalRunner.receive(:flow => subject) do
8
- run :default
9
- end
10
- end
11
- out.string.split("\n").first.should =~ /\{\"ip_address\":\"[\d\.]+\",.*\"}/
12
- end
13
- end
4
+ # describe_example_script(:parse_apache_logs, 'dataflow/parse_apache_logs.rb') do
5
+ # it 'runs' do
6
+ # subject = Wukong.dataflow(:parse_apache_logs)
7
+ # out, err = Gorillib::TestHelpers.capture_output do
8
+ # Wukong::LocalRunner.receive(:flow => subject) do
9
+ # run :default
10
+ # end
11
+ # end
12
+ # out.string.split("\n").first.should == "127.0.0.1 - - [10/Apr/2007:10:39:11 +0300] \"GET / HTTP/1.1\" 500 606 \"-\" \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)\"\t"
13
+ # end
14
+ # end
@@ -1,8 +1,34 @@
1
1
  require 'spec_helper'
2
- require 'wukong'
2
+ # require 'wukong'
3
3
 
4
- describe_example_script(:simple, 'dataflow/simple.rb') do
5
- it 'runs' do
6
- Wukong::LocalRunner.run(subject, :default)
7
- end
8
- end
4
+ # Hanuman::Graph.class_eval do
5
+ # def foo_graph(label, &block)
6
+ # stage(label, :_type => Hanuman::FooGraph, &block)
7
+ # end
8
+ # end
9
+ # class Hanuman::FooGraph < Hanuman::Graph
10
+ # # field :inputs, Gorillib::Collection, :of => Hanuman::InputSlot, :doc => 'inputs to this stage', :default => ->{ Gorillib::Collection.new }
11
+ # # field :outputs, Gorillib::Collection, :of => Hanuman::OutputSlot, :doc => 'outputs of this stage', :default => ->{ Gorillib::Collection.new }
12
+ #
13
+ # collection :inputs, Hanuman::InputSlot
14
+ #
15
+ # end
16
+
17
+ # describe 'example', :examples_spec do
18
+ # # describe_example_script(:simple, 'dataflow/simple.rb', :only => true) do
19
+ # # it 'runs' do
20
+ # # p subject
21
+ # # end
22
+ # # end
23
+
24
+ # it 'runs' do
25
+ # # load Pathname.path_to(:examples, 'dataflow/simple.rb')
26
+
27
+ # Wukong.dataflow(:bob) do
28
+ # ff = file_source(Pathname.path_to(:data, 'text/jabberwocky.txt')){ p self }
29
+
30
+
31
+ # end
32
+
33
+ # end
34
+ # end
@@ -1,43 +1,43 @@
1
1
  require 'spec_helper'
2
- require 'wukong'
2
+ # require 'wukong'
3
3
 
4
- describe_example_script(:telegram, 'dataflow/telegram.rb') do
5
- it 'runs' do
6
- Wukong::LocalRunner.run(subject, :default)
7
- end
4
+ # describe_example_script(:telegram, 'dataflow/telegram.rb') do
5
+ # it 'runs' do
6
+ # Wukong::LocalRunner.run(subject, :default)
7
+ # end
8
8
 
9
- context 'Recompose processor' do
10
- subject{ Wukong::Widget::Recompose }
11
- its(:field_names){ should include(:break_length) }
9
+ # context 'Recompose processor' do
10
+ # subject{ Wukong::Widget::Recompose }
11
+ # its(:field_names){ should include(:break_length) }
12
12
 
13
- let(:words ){
14
- # 0 5 1 5 2 5 3 5 4 5 5 5 6 5 7 5 8
15
- %w[
16
- If names be not correct, language is not in accordance with
17
- the truth of things. If language be not in accordance with
18
- the truth of things, affairs cannot be carried on to success. ] }
13
+ # let(:words ){
14
+ # # 0 5 1 5 2 5 3 5 4 5 5 5 6 5 7 5 8
15
+ # %w[
16
+ # If names be not correct, language is not in accordance with
17
+ # the truth of things. If language be not in accordance with
18
+ # the truth of things, affairs cannot be carried on to success. ] }
19
19
 
20
- context '#process' do
21
- it 'breaks lines correctly' do
22
- (2..80).each do |len|
23
- # run the data flow into an array sink
24
- test_sink = Wukong::Sink::ArraySink.new
25
- rc = subject.new(:break_length => len, :output => test_sink )
26
- words.each{|word| rc.process(word) }
27
- rc.stop
28
- # start and end are correct
29
- test_sink.records.first.should =~ /^If/
30
- test_sink.records.last.should =~ /success\.$/
31
- # lines should be as long as possible, but not longer
32
- test_sink.records[0..-2].zip(test_sink.records[1..-1]) do |line, nextl|
33
- nextw = nextl.split[0]
34
- ((line.length <= len) || line !~ /\s/).should be_true
35
- (line.length + nextw.length + 1 > len).should be_true
36
- end
37
- end
38
- end
20
+ # context '#process' do
21
+ # it 'breaks lines correctly' do
22
+ # (2..80).each do |len|
23
+ # # run the data flow into an array sink
24
+ # test_sink = Wukong::Sink::ArraySink.new
25
+ # rc = subject.new(:break_length => len, :output => test_sink )
26
+ # words.each{|word| rc.process(word) }
27
+ # rc.stop
28
+ # # start and end are correct
29
+ # test_sink.records.first.should =~ /^If/
30
+ # test_sink.records.last.should =~ /success\.$/
31
+ # # lines should be as long as possible, but not longer
32
+ # test_sink.records[0..-2].zip(test_sink.records[1..-1]) do |line, nextl|
33
+ # nextw = nextl.split[0]
34
+ # ((line.length <= len) || line !~ /\s/).should be_true
35
+ # (line.length + nextw.length + 1 > len).should be_true
36
+ # end
37
+ # end
38
+ # end
39
39
 
40
- end
41
- end
40
+ # end
41
+ # end
42
42
 
43
- end
43
+ # end
@@ -1,35 +1,34 @@
1
- require 'spec_helper'
2
- require 'wukong'
3
- require 'wukong/local_runner'
1
+ # require 'spec_helper'
2
+ # require 'wukong'
4
3
 
5
- load Pathname.path_to(:examples, 'graph/minimum_spanning_tree.rb')
4
+ # load Pathname.path_to(:examples, 'graph/minimum_spanning_tree.rb')
6
5
 
7
- describe 'Minimum Spanning Tree', :examples_spec => true, :helpers => true do
6
+ # describe 'Minimum Spanning Tree', :examples_spec, :helpers do
8
7
 
9
- context Wukong::Widget::DisjointForest do
10
- subject{ Wukong::Widget::DisjointForest.new }
8
+ # context Wukong::Widget::DisjointForest do
9
+ # subject{ Wukong::Widget::DisjointForest.new }
11
10
 
12
- context 'operations' do
13
- before do
14
- %w[ AUS DFW ATL JFK SFO LGA LAX ].each{|el| subject.add el }
15
- subject.union('DFW', 'AUS')
16
- subject.union('ATL', 'JFK')
17
- subject.union('ATL', 'DFW')
18
- end
19
-
20
- context '#find' do
21
- it 'collapses elements into a shallow tree during a find' do
22
- subject.parent['ATL'].should == 'JFK'
23
- subject.parent['JFK'].should == 'AUS'
24
- subject.find('ATL').should == 'AUS'
25
- subject.parent['ATL'].should == 'AUS'
26
- end
27
- end
28
- context '#union' do
29
- it 'joins shallow tree to deep tree' do
30
- end
31
- end
32
- end
33
-
34
- end
35
- end
11
+ # context 'operations' do
12
+ # before do
13
+ # %w[ AUS DFW ATL JFK SFO LGA LAX ].each{|el| subject.add el }
14
+ # subject.union('DFW', 'AUS')
15
+ # subject.union('ATL', 'JFK')
16
+ # subject.union('ATL', 'DFW')
17
+ # end
18
+
19
+ # context '#find' do
20
+ # it 'collapses elements into a shallow tree during a find' do
21
+ # subject.parent['ATL'].should == 'JFK'
22
+ # subject.parent['JFK'].should == 'AUS'
23
+ # subject.find('ATL').should == 'AUS'
24
+ # subject.parent['ATL'].should == 'AUS'
25
+ # end
26
+ # end
27
+ # context '#union' do
28
+ # it 'joins shallow tree to deep tree' do
29
+ # end
30
+ # end
31
+ # end
32
+
33
+ # end
34
+ # end