wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,41 +0,0 @@
1
- module Wukong
2
- #
3
- # Local execution Options
4
- #
5
- module LocalCommand
6
-
7
- Settings.define :sort_command, :default => 'sort'
8
- Settings.define :sort_args, :default => [], :description => 'Extra params to send to the sort function eg: Settings.sort_args = ["-t", "\t", "-S", "200M"]'
9
-
10
- def execute_local_workflow
11
- Log.info " Reading STDIN / Writing STDOUT"
12
- execute_command!(local_commandline)
13
- end
14
-
15
- # program, including arg, to sort input between mapper and reducer in local
16
- # mode. You could override to for example run 'sort -n' (numeric sort).
17
- def local_mode_sort_commandline
18
- [ Settings.sort_command, Settings.sort_args ].flatten.join(" ")
19
- end
20
-
21
- #
22
- # Commandline string to execute the job in local mode
23
- #
24
- # With an input path of '-', just uses $stdin
25
- # With an output path of '-', just uses $stdout
26
- #
27
- def local_commandline
28
- @input_paths = input_paths.map(&:strip).join(' ')
29
- cmd_input_str = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
30
- cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
31
-
32
- if (reducer || options[:reduce_command])
33
- %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
34
- else
35
- %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} #{cmd_output_str} }
36
- end
37
-
38
- end
39
-
40
- end
41
- end
@@ -1,10 +0,0 @@
1
- module Wukong
2
- module Store
3
- autoload :Base, 'wukong/store/base'
4
- autoload :FlatFileStore, 'wukong/store/flat_file_store'
5
- autoload :ChunkedFlatFileStore, 'wukong/store/chunked_flat_file_store'
6
- autoload :ChhChunkedFlatFileStore, 'wukong/store/chh_chunked_flat_file_store'
7
-
8
- autoload :CassandraModel, 'wukong/store/cassandra_model'
9
- end
10
- end
@@ -1,27 +0,0 @@
1
- module Wukong
2
- module Store
3
- class Base
4
- def initialize options={}
5
- Log.info "Creating #{self.class} with #{options.inspect}"
6
- end
7
-
8
- #Iterate through each object casting it as a new object of klass.
9
- def each_as klass, &block
10
- self.each do |*args|
11
- begin
12
- item = klass.new *args[1..-1]
13
- rescue StandardError => e
14
- Log.info [args, e.to_s, self].join("\t")
15
- raise e
16
- end
17
- yield item
18
- end
19
- end
20
-
21
- def log_line
22
- nil
23
- end
24
-
25
- end
26
- end
27
- end
@@ -1,10 +0,0 @@
1
- Settings.define :cassandra_hosts, :default => '127.0.0.1:9160', :type => Array, :description => 'Comma-delimited list of hostname:port addresses for the Cassandra database holding Twitter API objects'
2
- Settings.define :cassandra_keyspace, :default => 'soc_net_tw', :description => 'Cassandra keyspace for Twitter objects'
3
-
4
- module Wukong
5
- module Store
6
- module CassandraStore
7
- autoload :StructLoader, 'wukong/store/cassandra/struct_loader'
8
- end
9
- end
10
- end
@@ -1,75 +0,0 @@
1
- require 'avro'
2
-
3
- Settings.define :cassandra_avro_schema, :default => ('/usr/local/share/cassandra/interface/avro/cassandra.avpr')
4
- module Wukong::Store::CassandraModel
5
-
6
- #
7
- # Store model using avro writer
8
- #
9
- def streaming_save
10
- self.class.streaming_insert id, self
11
- end
12
- module ClassMethods
13
-
14
- def streaming_writer
15
- @streaming_writer ||= AvroWriter.new
16
- end
17
-
18
- #
19
- # Use avro and stream into cassandra
20
- #
21
- def streaming_insert id, hsh
22
- streaming_writer.put(id.to_s, hsh.to_db_hash)
23
- end
24
- end
25
- class AvroWriter
26
- #
27
- # Reads in the protocol schema
28
- # creates the necessary encoder and writer.
29
- #
30
- def initialize
31
- schema_file = Settings.cassandra_avro_schema
32
- @proto = Avro::Protocol.parse(File.read(schema_file))
33
- @schema = @proto.types.detect{|schema| schema.name == 'StreamingMutation'}
34
- @enc = Avro::IO::BinaryEncoder.new($stdout)
35
- # @enc = DummyEncoder.new($stdout)
36
- @writer = Avro::IO::DatumWriter.new(@schema)
37
- # warn [@schema, @enc].inspect
38
- end
39
-
40
- def write key, col_name, value
41
- @writer.write(smutation(key, col_name, value), @enc)
42
- end
43
-
44
- def write_directly key, col_name, value, timestamp, ttl
45
- # Log.info "Insert(row_key => #{key}, col_name => #{col_name}, value => #{value}"
46
- @enc.write_bytes(key)
47
- @enc.write_bytes(col_name)
48
- @enc.write_bytes(value)
49
- @enc.write_long(timestamp)
50
- @enc.write_int(ttl)
51
- end
52
-
53
- #
54
- # Iterate through each key value pair in the hash to
55
- # be inserted and write directly one at a time
56
- #
57
- def put id, hsh, timestamp=nil, ttl=0
58
- timestamp ||= Time.now.to_i
59
- hsh.each do |attr, val|
60
- write_directly(id, attr, val, timestamp, ttl)
61
- end
62
- end
63
-
64
- def smutation key, name, value
65
- {
66
- 'key' => key,
67
- 'name' => name.to_s,
68
- 'value' => value.to_s,
69
- 'timestamp' => Time.epoch_microseconds,
70
- 'ttl' => 0
71
- }
72
- end
73
- end
74
-
75
- end
@@ -1,21 +0,0 @@
1
- require 'avro'
2
-
3
- Settings.define :cassandra_avro_schema, :default => ('/usr/local/share/cassandra/interface/avro/cassandra.avpr')
4
-
5
- module Wukong::Store::Cassandra
6
- class StructLoader < Wukong::Streamer::StructStreamer
7
- def initialize *args
8
- super(*args)
9
- @log = PeriodicMonitor.new
10
- end
11
-
12
- #
13
- # Blindly expects objects streaming by to have a "streaming_save" method
14
- #
15
- def process object, *_
16
- # object.save
17
- object.streaming_save
18
- @log.periodically(object.to_flat)
19
- end
20
- end
21
- end
@@ -1,91 +0,0 @@
1
module Wukong
  module Store
    #
    # Barebones interface between a wukong class and a cassandra database.
    #
    # The including class must supply:
    # * a class-level +cassandra_db+ accessor backed by the @cassandra_db
    #   instance variable (ClassMethods#handle_error resets that variable);
    # * class-level +class_basename+ and +from_hash+;
    # * instance-level +key+ and +each_pair+.
    #
    module CassandraModel
      #
      # Store model to the DB, under its +key+.
      #
      def save
        self.class.insert(key, to_db_hash)
      end

      #
      # Flatten attributes for storage in the DB.
      #
      # * omits elements whose value is nil
      # * calls to_s on everything else
      #   * This means that blank strings are preserved;
      #   * and that false is saved as 'false'
      #
      # Override if you think something fancier than that should happen.
      #
      def to_db_hash
        db_hsh = {}
        each_pair { |k, v| db_hsh[k.to_s] = v.to_s unless v.nil? }
        db_hsh
      end

      module ClassMethods
        # Cassandra column family -- taken from the class name by default.
        def table_name
          class_basename
        end

        # Override to control how your class is instantiated from the DB hash
        def from_db_hash(*args)
          from_hash(*args)
        end

        # Insert into the cassandra database.
        # The first of +args+ is the hash to store (normally the object's
        # #to_db_hash); any further arguments are ignored.
        def insert(key, *args)
          hsh = args.first
          cassandra_db.insert(table_name, key.to_s, hsh)
        end

        # Load from the cassandra database and build an instance through
        # #from_db_hash. Returns nil when nothing is stored under +key+.
        #
        # Reads from +table_name+ -- the same column family #insert writes
        # to -- so a class that overrides +table_name+ can load what it saved.
        def load(key)
          hsh = cassandra_db.get(table_name, key.to_s)
          from_db_hash(hsh) if hsh
        end

        # invalidates cassandra connection on errors where that makes sense.
        #
        # Logs the failure, clears @cassandra_db and pauses for 0.2 seconds.
        # Exception#backtrace is nil for an exception that was never raised,
        # hence the Array() wrapper.
        def handle_error(action, e)
          warn "#{action} failed: #{e} #{Array(e.backtrace).join("\t")}"
          @cassandra_db = nil
          sleep 0.2
        end
      end

      # The standard 'inject class methods when module is included' trick
      def self.included(base)
        base.extend(ClassMethods)
      end
    end
  end
end
74
-
75
Hash.class_eval do
  #
  # Flatten the hash for storage in the DB.
  #
  # * entries whose value is nil are dropped
  # * every other key and value is converted with to_s, so blank strings
  #   survive and false becomes 'false'
  #
  # Override if you think something fancier than that should happen.
  #
  def to_db_hash
    to_hash.each_with_object({}) do |(k, v), flat|
      flat[k.to_s] = v.to_s unless v.nil?
    end
  end
end
@@ -1,37 +0,0 @@
1
module Wukong
  module Store
    # Flat-file store whose filename is generated from a FilenamePattern.
    # #new_chunk closes the current file and switches to a freshly generated
    # filename.
    class ChhChunkedFlatFileStore < Wukong::Store::FlatFileStore
      attr_accessor :filename_pattern, :handle, :rootdir

      # Move to configliere
      Settings.define :chunk_file_pattern, :default => ":rootdir/:date/:handle:timestamp-:pid.tsv", :description => "The pattern for chunked files."
      Settings.define :chunk_file_rootdir, :default => nil, :description => "The root directory for the chunked files."

      # Note that filemode is inherited from flat_file

      # Options read here: :rootdir and :pattern (each falling back to the
      # matching chunk_file_* setting) and :handle. The generated filename is
      # written into options[:filename] before the hash is passed to super.
      def initialize(options = {})
        # The parent constructor needs options[:filename], so the first
        # filename must be generated before calling it.
        self.rootdir = options[:rootdir] || Settings[:chunk_file_rootdir]
        self.handle  = options[:handle]
        self.filename_pattern = FilenamePattern.new(
          options[:pattern] || Settings[:chunk_file_pattern],
          :handle => handle, :rootdir => rootdir
        )
        options[:filename] = filename_pattern.make

        super(options)

        mkdir!
      end

      # Flushes and closes the current file, then points the store at a newly
      # generated filename and makes sure that file's directory exists.
      def new_chunk
        next_filename = filename_pattern.make
        Log.info "Rotating chunked file #{filename} into #{next_filename}"
        flush
        close
        @filename = next_filename
        mkdir!
      end
    end
  end
end
@@ -1,48 +0,0 @@
1
- require 'wukong/monitor/periodic_monitor'
2
module Wukong
  module Store
    # Flat-file store that rotates to a new pattern-generated file whenever
    # its PeriodicMonitor (configured with the chunk interval) fires.
    class ChunkedFlatFileStore < Wukong::Store::FlatFileStore
      attr_accessor :filename_pattern, :chunk_monitor, :handle, :chunktime, :rootdir

      # Move to configliere
      Settings.define :chunk_file_pattern,  :default => ":rootdir/:date/:handle-:timestamp-:pid.tsv", :description => "The pattern for chunked files."
      Settings.define :chunk_file_interval, :default => 4*60*60, :description => "The time interval to keep a chunk file open."
      Settings.define :chunk_file_rootdir,  :default => '/tmp', :description => "The root directory for the chunked files."

      # Note that filemode is inherited from flat_file

      # Options read here: :interval, :rootdir and :pattern (each falling back
      # to the matching chunk_file_* setting), :handle, and :filemode (set to
      # 'a' when absent). The generated filename is written into
      # options[:filename] before the hash is passed to super.
      def initialize(options = {})
        # The parent constructor needs options[:filename], so the first
        # filename must be generated before calling it.
        self.chunktime = options[:interval] || Settings[:chunk_file_interval]
        self.rootdir   = options[:rootdir]  || Settings[:chunk_file_rootdir]
        self.handle    = options[:handle]
        self.filename_pattern = FilenamePattern.new(
          options[:pattern] || Settings[:chunk_file_pattern],
          :handle => handle, :rootdir => rootdir
        )
        options[:filename]   = filename_pattern.make
        options[:filemode] ||= 'a'
        Log.warn "You don't really want a chunk time this small: #{chunktime}" unless chunktime > 600
        self.chunk_monitor = Wukong::Monitor::PeriodicMonitor.new(:time => chunktime)

        super(options)
        mkdir!
      end

      # Flushes and closes the current file, then points the store at a newly
      # generated filename and makes sure that file's directory exists.
      def new_chunk!
        next_filename = filename_pattern.make
        Log.info "Rotating chunked file #{filename} into #{next_filename}"
        flush
        close
        @filename = next_filename
        mkdir!
      end

      # Saves through the parent class, then gives the monitor a chance to
      # rotate the file. Returns the parent's result.
      def save(*args)
        super(*args).tap { chunk_monitor.periodically { new_chunk! } }
      end
    end
  end
end
@@ -1,57 +0,0 @@
1
module Monkeyshines
  module Store
    # Pairs a key cache with a backing store: a value is computed and saved
    # only when its key is not already in the cache (or when forced).
    class ConditionalStore < Monkeyshines::Store::Base
      attr_accessor :options, :cache, :store, :misses

      DEFAULT_OPTIONS = {
        :cache => { :type => :tyrant_rdb_key_store },
        :store => { :type => :chunked_flat_file_store },
      }

      #
      # +_options+ is deep-merged over DEFAULT_OPTIONS; its :cache and :store
      # entries are each passed to Monkeyshines::Store.create.
      #
      # +cache+ must behave like a hash (Hash and
      # Monkeyshines::Store::TyrantRdbKeyStore are both cromulent
      # choices).
      #
      def initialize(_options)
        self.options = DEFAULT_OPTIONS.deep_merge(_options)
        self.cache   = Monkeyshines::Store.create(options[:cache])
        self.store   = Monkeyshines::Store.create(options[:store])
        self.misses  = 0
      end

      #
      # If key is absent, save the result of calling the block.
      # If key is present, block is never called (unless +force+ is truthy).
      #
      # The block returns a [cache_val, store_val] pair; when cache_val is
      # falsy nothing is saved and nil is returned. Otherwise returns
      # store_val.
      #
      # Ex:
      #   rt_store.set(url) do
      #     fetcher.get url # will only be called if url isn't in rt_store
      #   end
      #
      def set(key, force = nil, &block)
        unless force
          return if cache.include?(key)
        end
        cache_val, store_val = block.call
        return unless cache_val
        cache.set_nr(key, cache_val) # update cache
        store << store_val           # save value
        self.misses += 1             # track the cache miss
        store_val
      end

      # Number of entries in the cache.
      def size
        cache.size
      end

      # Cache size plus a formatted count of misses.
      def log_line
        [size, "%8d misses" % misses]
      end

      # Closes the cache first, then the store.
      def close
        cache.close
        store.close
      end
    end
  end
end
@@ -1,8 +0,0 @@
1
module Monkeyshines
  module Store
    # Placeholder factory: .generate takes a type and options but has an
    # empty body, so it always returns nil.
    class Factory
      class << self
        def generate(type, opts); end
      end
    end
  end
end
@@ -1,89 +0,0 @@
1
- require 'fileutils'; include FileUtils
2
-
3
module Wukong
  module Store
    #
    # Store backed by a single tab-separated file that is opened lazily.
    #
    class FlatFileStore < Store::Base
      attr_accessor :filename, :filemode

      #
      # Options read here:
      # * :filename -- required; raises if missing
      # * :filemode -- mode handed to File.open, 'r' when absent
      # * :skip     -- number of lines to read past immediately
      #
      def initialize(options = {})
        super(options)
        given_name = options[:filename]
        self.filename = given_name
        raise "Missing filename in #{self.class}" unless given_name
        self.filemode = options[:filemode] || 'r'
        skip!(options[:skip]) if options[:skip]
      end

      #
      # Yields the tab-separated fields of each line as separate block
      # arguments; lines whose field list is blank are passed over.
      #
      def each(&block)
        file.each do |line|
          fields = line.chomp.split("\t")
          yield(*fields) unless fields.blank?
        end
      end

      #
      # Read ahead n_lines lines in the file
      #
      def skip!(n_lines)
        Log.info "Skipping #{n_lines} in #{self.class}:#{filename}"
        n_lines.times { file.readline }
      end

      #
      # The open file handle. Opened on first use with the configured
      # filename and filemode, then reused until #close.
      #
      def file
        return @file if @file
        Log.info "Opening file #{filename} with mode #{filemode}"
        @file = File.open(filename, filemode)
      end

      # Close the dump file and forget the handle
      def close
        @file.close if @file
        @file = nil
      end

      # Flush the file if it is currently open
      def flush
        return unless @file
        @file.flush
      end

      # Ensure the file's directory exists
      def mkdir!
        dir = File.dirname(filename)
        unless File.directory?(dir)
          Log.info "Making directory #{dir}"
          FileUtils.mkdir_p(dir)
        end
      end

      # write to the file; returns the object written
      def save(obj)
        file.puts(obj)
        obj
      end

      # returns the size of the current file, or 0 when no file is open
      def size
        @file ? File.size(filename) : 0
      end

      # delegates to +#save+ -- writes the object to the file. Returns self for chaining on the stream.
      def <<(obj)
        save(obj)
        self
      end
    end
  end
end
89
-