wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476)
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,16 +0,0 @@
1
- class Hash
2
-
3
- # Return a new hash with all keys converted to symbols.
4
- def symbolize_keys
5
- inject({}) do |options, (key, value)|
6
- options[(key.to_sym rescue key) || key] = value
7
- options
8
- end
9
- end
10
-
11
- # Destructively convert all keys to symbols.
12
- def symbolize_keys!
13
- self.replace(self.symbolize_keys)
14
- end
15
-
16
- end
@@ -1,150 +0,0 @@
1
- module Wukong
2
- #
3
- # A hashlike has to
4
- #
5
- # *
6
- # * The arguments to your initializer should be the same as the keys, in order
7
- # If not, you must override #from_hash
8
- #
9
- #
10
- module HashLike
11
-
12
- # List of possible keys --
13
- # delegates to the class
14
- def keys
15
- self.class.keys
16
- end
17
-
18
- #
19
- # Return a Hash containing only values for the given keys.
20
- #
21
- # Since this is intended to mirror Hash#slice it will harmlessly ignore keys
22
- # not present in the struct. They will be unset (hsh.include? is not true)
23
- # as opposed to nil.
24
- #
25
- def slice *keys
26
- keys.inject({}) do |hsh, key|
27
- hsh[key] = send(key) if respond_to?(key)
28
- hsh
29
- end
30
- end
31
-
32
- #
33
- # values_at like a hash
34
- #
35
- # Since this is intended to mirror Hash#values_at it will harmlessly ignore
36
- # keys not present in the struct
37
- #
38
- def values_of *keys
39
- keys.map{|key| self.send(key) if respond_to?(key) }
40
- end
41
-
42
- #
43
- # Convert to a hash
44
- #
45
- def to_hash
46
- slice(*self.class.members)
47
- end
48
-
49
- #
50
- # Analagous to Hash#each_pair
51
- #
52
- def pairs
53
- self.class.members.map{|attr| [attr, self[attr]] }
54
- end
55
-
56
- #
57
- # Analagous to Hash#each_pair
58
- #
59
- def each_pair *args, &block
60
- pairs.each(*args, &block)
61
- end
62
-
63
- #
64
- # Analagous to Hash#merge
65
- #
66
- def merge *args
67
- self.dup.merge!(*args)
68
- end
69
- def merge! hsh, &block
70
- raise "can't handle block arg yet" if block
71
- hsh.each_pair{|key, val| self.send("#{key}=", val) if self.respond_to?("#{key}=") }
72
- self
73
- end
74
- alias_method :update, :merge!
75
-
76
- #
77
- # Merge hashes recursively.
78
- # Nothing special happens to array values
79
- #
80
- # x = { :subhash => { 1 => :val_from_x, 222 => :only_in_x, 333 => :only_in_x }, :scalar => :scalar_from_x}
81
- # y = { :subhash => { 1 => :val_from_y, 999 => :only_in_y }, :scalar => :scalar_from_y }
82
- # x.deep_merge y
83
- # => {:subhash=>{1=>:val_from_y, 222=>:only_in_x, 333=>:only_in_x, 999=>:only_in_y}, :scalar=>:scalar_from_y}
84
- # y.deep_merge x
85
- # => {:subhash=>{1=>:val_from_x, 222=>:only_in_x, 333=>:only_in_x, 999=>:only_in_y}, :scalar=>:scalar_from_x}
86
- #
87
- def deep_merge hsh2
88
- merge hsh2, &Hash::DEEP_MERGER
89
- end
90
-
91
- #
92
- # remove all key-value pairs where the value is blank
93
- #
94
- def compact_blank
95
- to_hash.compact_blank!
96
- end
97
-
98
- module ClassMethods
99
- #
100
- # Instantiate an instance of the struct from a hash
101
- #
102
- # Specify has_symbol_keys if the supplied hash's keys are symbolic;
103
- # otherwise they must be uniformly strings
104
- #
105
- def from_hash(hsh, has_symbol_keys=false)
106
- extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
107
- self.new(*hsh.values_of(*extract_keys))
108
- end
109
- #
110
- # The last portion of the class in underscored form
111
- # memoized
112
- #
113
- def resource_name
114
- @resource_name ||= self.class_basename.underscore.to_sym
115
- end
116
- # The last portion of the class name
117
- # memoized
118
- #
119
- # @example
120
- # This::That::TheOther.new.class_basename # => TheOther
121
- def class_basename
122
- @class_basename ||= self.to_s.gsub(%r{.*::}, '')
123
- end
124
- end
125
-
126
- def self.included base
127
- base.class_eval do
128
- extend ClassMethods
129
- end
130
- end
131
-
132
- def coerce_attr attr, coerce_blank_to_nil=false, &block
133
- orig_val = self.send(attr)
134
- new_val = (coerce_blank_to_nil && orig_val.blank?) ? nil : block.call(orig_val)
135
- self.send("#{attr}=", new_val)
136
- end
137
-
138
- def coerce_to_int! attr, *args
139
- coerce_attr(attr, *args) do |val|
140
- val.to_i
141
- end
142
- end
143
-
144
- def coerce_to_date! attr, *args
145
- coerce_attr(attr, *args){|val| val.is_a?(DateTime) ? val : DateTime.parse(val) rescue nil }
146
- end
147
-
148
- end
149
-
150
- end
@@ -1,47 +0,0 @@
1
- require 'wukong/extensions/class'
2
- module Wukong
3
-
4
- module HashlikeClass
5
- module ClassMethods
6
- def has_members *members
7
- self.members ||= []
8
- self.members = members.map(&:to_s) + self.members
9
- self.members.each do |member|
10
- attr_accessor member.to_sym
11
- end
12
- end
13
- alias_method :has_member, :has_members
14
- def keys
15
- members
16
- end
17
- end
18
-
19
- def [](key)
20
- self.send(key)
21
- end
22
-
23
- def []=(key, val)
24
- self.send("#{key}=", val)
25
- end
26
-
27
- def to_a
28
- values_of(*members)
29
- end
30
-
31
- def to_flat
32
- to_a.map(&:to_flat).flatten
33
- end
34
-
35
- def self.included base
36
- base.class_eval do
37
- extend ClassMethods
38
- include HashLike
39
- class_inheritable_accessor :members
40
-
41
- def to_hash *args
42
- super(*args).merge 'type' => self.class.to_s
43
- end
44
- end
45
- end
46
- end
47
- end
@@ -1,2 +0,0 @@
1
- require 'extlib/object'
2
- require 'extlib/module'
@@ -1,27 +0,0 @@
1
- require 'pathname'
2
- class Pathname
3
- # Append path segments and expand to absolute path
4
- #
5
- # file = Pathname(Dir.pwd) / "subdir1" / :subdir2 / "filename.ext"
6
- #
7
- # @param [Pathname, String, #to_s] path path segment to concatenate with receiver
8
- #
9
- # @return [Pathname]
10
- # receiver with _path_ appended and expanded to an absolute path
11
- #
12
- # @api public
13
- def /(path)
14
- (self + path).expand_path
15
- end
16
-
17
- def self.[](*vals)
18
- new( File.join(vals) )
19
- end
20
- end
21
-
22
- class Subdir < Pathname
23
- def self.[](*vals)
24
- dir = File.dirname(vals.shift)
25
- new(File.join(dir, *vals))
26
- end
27
- end
@@ -1,65 +0,0 @@
1
- #
2
- # String Monkeypatched for processing with wukong: see wukong/extensions/string
3
- #
4
- String.class_eval do
5
- # By default, +camelize+ converts strings to UpperCamelCase. If the argument to +camelize+
6
- # is set to <tt>:lower</tt> then +camelize+ produces lowerCamelCase.
7
- #
8
- # +camelize+ will also convert '/' to '::' which is useful for converting paths to namespaces.
9
- #
10
- # Examples:
11
- # "active_record".camelize # => "ActiveRecord"
12
- # "active_record".camelize(:lower) # => "activeRecord"
13
- # "active_record/errors".camelize # => "ActiveRecord::Errors"
14
- # "active_record/errors".camelize(:lower) # => "activeRecord::Errors"
15
- def camelize(first_letter_in_uppercase = true)
16
- if first_letter_in_uppercase
17
- self.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
18
- else
19
- self.first + camelize(self)[1..-1]
20
- end
21
- end
22
-
23
- #
24
- # The reverse of +camelize+. Makes an underscored, lowercase form from the expression in the string.
25
- #
26
- # Changes '::' to '/' to convert namespaces to paths.
27
- #
28
- # Examples:
29
- # "ActiveRecord".underscore # => "active_record"
30
- # "ActiveRecord::Errors".underscore # => active_record/errors
31
- #
32
- # Stolen from active_support
33
- #
34
- def underscore
35
- gsub(/::/, '/').
36
- gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
37
- gsub(/([a-z\d])([A-Z])/,'\1_\2').
38
- tr("-", "_").
39
- downcase
40
- end
41
-
42
- # Tries to find a constant with the name specified in the argument string:
43
- #
44
- # "Module".constantize # => Module
45
- # "Test::Unit".constantize # => Test::Unit
46
- #
47
- # The name is assumed to be the one of a top-level constant, no matter whether
48
- # it starts with "::" or not. No lexical context is taken into account:
49
- #
50
- # C = 'outside'
51
- # module M
52
- # C = 'inside'
53
- # C # => 'inside'
54
- # "C".constantize # => 'outside', same as ::C
55
- # end
56
- #
57
- # NameError is raised when the name is not in CamelCase or the constant is
58
- # unknown.
59
- def constantize
60
- unless /\A(?:::)?([A-Z]\w*(?:::[A-Z]\w*)*)\z/ =~ self
61
- raise NameError, "#{self.inspect} is not a valid constant name!"
62
- end
63
- Object.module_eval("::#{$1}", __FILE__, __LINE__)
64
- end
65
- end
@@ -1,17 +0,0 @@
1
- require 'wukong/extensions/hash'
2
- require 'wukong/extensions/hash_like'
3
- require 'wukong/extensions/symbol'
4
-
5
- #
6
- # extensions/struct
7
- #
8
- # Add several methods to make a struct duck-type much more like a Hash
9
- #
10
- Struct.class_eval do
11
- include Wukong::HashLike
12
- def self.keys
13
- members
14
- end
15
- end
16
-
17
-
@@ -1,11 +0,0 @@
1
- #
2
- # h2. extensions/symbol.rb -- extensions to symbol class
3
- #
4
- class Symbol
5
- #
6
- # Turn the symbol into a simple proc (stolen from
7
- # <tt>ActiveSupport::CoreExtensions::Symbol</tt>).
8
- def to_proc
9
- Proc.new { |*args| args.shift.__send__(self, *args) }
10
- end unless method_defined?(:to_proc)
11
- end
@@ -1,74 +0,0 @@
1
- module Wukong
2
- class FilenamePattern
3
- # the filename pattern, e.g. 'ripd/:handle/:date/:handle+:timestamp-:pid-:hostname.tsv'
4
- attr_accessor :pattern
5
- # custom token replacements
6
- attr_accessor :token_val_defaults
7
-
8
- DEFAULT_PATTERN_STR = ":dest_dir/:handle_prefix/:handle/:date/:handle:timestamp-:pid-:hostname.tsv"
9
-
10
- def initialize pattern, token_val_defaults={}
11
- self.pattern = pattern
12
- self.token_val_defaults = token_val_defaults
13
- end
14
-
15
- #
16
- # walk through pattern, replacing tokens (eg :time or :pid) with the
17
- # corresponding value.
18
- #
19
- # Don't use ':' in a pattern except to introduce a token
20
- # and separate tokens with '-', '+' '/' or '.'
21
- #
22
- def make token_vals={}
23
- token_vals = token_val_defaults.merge token_vals
24
- token_vals[:timestamp] ||= Time.now.utc.strftime("%Y%m%d%H%M%S")
25
- val = pattern.gsub(/:(\w+)/){ replace($1, token_vals) }
26
- val
27
- end
28
-
29
- def to_s token_vals={}
30
- make token_vals
31
- end
32
-
33
- #
34
- # substitute for token
35
- #
36
- def replace token, token_vals
37
- token = token.to_sym
38
- return token_vals[token] if token_vals.include? token
39
- case token
40
- when :pid then pid
41
- when :hostname then hostname
42
- when :handle then token_vals[:handle]
43
- when :handle_prefix then token_vals[:handle].to_s[0..5]
44
- when :timestamp then token_vals[:timestamp]
45
- when :date then token_vals[:timestamp][ 0..7]
46
- when :time then token_vals[:timestamp][ 8..13]
47
- when :hour then token_vals[:timestamp][ 8..9]
48
- when :h4 then "%0.2d" % (( token_vals[:timestamp][8..9].to_i / 4 ) * 4)
49
- when :min then token_vals[:timestamp][10..11]
50
- when :sec then token_vals[:timestamp][12..13]
51
- when :s10 then "%0.2d" % (( token_vals[:timestamp][12..13].to_i / 10 ) * 10)
52
- else
53
- raise "Don't know how to encode token #{token} #{token_vals[token]}"
54
- end
55
- end
56
-
57
- # Memoized: the hostname for the machine running this script.
58
- def hostname
59
- @hostname ||= ENV['HOSTNAME'] || `hostname`.chomp
60
- end
61
- # Memoized: the Process ID for this invocation.
62
- def pid
63
- @pid ||= Process.pid
64
- end
65
-
66
- # Characters deemed safe in a filename;
67
- SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/'
68
- RE_SAFE_FILENAME = %r{[^#{SAFE_CHARS}]+}moxi
69
- def self.sanitize str
70
- str.gsub(RE_SAFE_FILENAME, '-')
71
- end
72
-
73
- end
74
- end
@@ -1,7 +0,0 @@
1
- module Wukong
2
- module Helper
3
-
4
- autoload :Tokenize, 'wukong/helper/tokenize'
5
-
6
- end
7
- end
@@ -1,195 +0,0 @@
1
- module Wukong
2
- module Corpus
3
- STOPWORDS = %w[
4
- the
5
- of
6
- and
7
- a
8
- in
9
- to
10
- it
11
- is
12
- was
13
- I
14
- for
15
- that
16
- you
17
- he
18
- be
19
- with
20
- on
21
- by
22
- at
23
- have
24
- are
25
- not
26
- this
27
- but
28
- had
29
- they
30
- his
31
- from
32
- she
33
- which
34
- or
35
- we
36
- an
37
- were
38
- as
39
- do
40
- been
41
- their
42
- has
43
- would
44
- there
45
- what
46
- will
47
- all
48
- if
49
- can
50
- her
51
- said
52
- who
53
- so
54
- up
55
- them
56
- when
57
- some
58
- could
59
- him
60
- into
61
- its
62
- then
63
- out
64
- my
65
- about
66
- did
67
- your
68
- me
69
- other
70
- just
71
- more
72
- these
73
- also
74
- any
75
- see
76
- very
77
- may
78
- well
79
- should
80
- than
81
- how
82
- get
83
- way
84
- our
85
- made
86
- got
87
- after
88
- many
89
- those
90
- go
91
- being
92
- because
93
- down
94
- such
95
- over
96
- must
97
- still
98
- even
99
- too
100
- here
101
- come
102
- own
103
- last
104
- does
105
- oh
106
- no
107
- where
108
- us
109
- same
110
- might
111
- yes
112
- put
113
- another
114
- most
115
- again
116
- under
117
- much
118
- why
119
- each
120
- while
121
- off
122
- went
123
- used
124
- without
125
- give
126
- within
127
-
128
- am
129
- aren't
130
- between
131
- both
132
- can't
133
- cannot
134
- couldn't
135
- didn't
136
- doesn't
137
- doing
138
- don't
139
- hadn't
140
- hasn't
141
- haven't
142
- having
143
- he'd
144
- he'll
145
- he's
146
- here's
147
- hers
148
- how's
149
- i'd
150
- i'll
151
- i'm
152
- i've
153
- isn't
154
- it'd
155
- it'll
156
- it's
157
- let's
158
- once
159
- only
160
- ought
161
- ours
162
- she'd
163
- she'll
164
- she's
165
- shouldn't
166
- that's
167
- theirs
168
- there's
169
- they'd
170
- they'll
171
- they're
172
- they've
173
- through
174
- wasn't
175
- we'd
176
- we'll
177
- we're
178
- we've
179
- weren't
180
- what's
181
- where's
182
- who's
183
- won't
184
- wouldn't
185
- you'd
186
- you'll
187
- you're
188
- you've
189
- yours
190
-
191
- ].to_set
192
- STOPWORDS_3 = STOPWORDS.reject{|w| w.length < 3 }.to_set
193
-
194
- end
195
- end