wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,3 +0,0 @@
1
- australia 253 499 671 663 710 687 774 654 627 422 376 132 25
2
- spain 37 102 257 177 118 90 144 183 210 222 162 93 17
3
- sweden 32 167 306 334 314 287 330 366 415 343 266 130 51
@@ -1,20 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # run like so:
3
- # $> ruby normalize.rb --run=local data/sizes.tsv data/normalized_sizes.tsv
4
- require 'rubygems'
5
- require 'wukong'
6
- require 'active_support/core_ext/enumerable' # for array#sum
7
-
8
- module Normalize
9
- class Mapper < Wukong::Streamer::RecordStreamer
10
- def process(country, *sizes)
11
- sizes.map!(&:to_i)
12
- sum = sizes.sum.to_f
13
- normalized = sizes.map{|x| 100 * x/sum }
14
- s = normalized.join(",")
15
- yield [country, s]
16
- end
17
- end
18
- end
19
-
20
- Wukong::Script.new(Normalize::Mapper, nil).run
@@ -1,55 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # run like so:
3
- # $> ruby sizes.rb --run=local data/orders.tsv data/sizes
4
- require 'rubygems'
5
- require 'wukong'
6
-
7
- module JeanSizes
8
- class Mapper < Wukong::Streamer::RecordStreamer
9
- def process(code,model,time,country,reg,col, n1,c1, venue,n3,n4, *sizes)
10
- yield [country, *sizes]
11
- end
12
- end
13
-
14
- #
15
- # This uses a ListReducer. It's nice and simple, but requires first
16
- # accumulating each key's records in memory.
17
- #
18
- class JeansListReducer < Wukong::Streamer::ListReducer
19
- def finalize
20
- return if values.empty?
21
- sums = []; 13.times{ sums << 0 }
22
- values.each do |country, *sizes|
23
- sizes.map!(&:to_i)
24
- sums = sums.zip(sizes).map{|sum, val| sum + val }
25
- end
26
- yield [key, *sums]
27
- end
28
- end
29
-
30
-
31
- #
32
- # This uses an AccumulatingReducer directly.
33
- # It has the advantage of a minimal footprint.
34
- #
35
- class JeansAccumulatingReducer < Wukong::Streamer::AccumulatingReducer
36
- attr_accessor :sums
37
-
38
- # start the sum with 0 for each size
39
- def start! *_
40
- self.sums = []; 13.times{ self.sums << 0 }
41
- end
42
- # accumulate each size count into the sizes_sum
43
- def accumulate country, *sizes
44
- sizes.map!(&:to_i)
45
- self.sums = self.sums.zip(sizes).map{|sum, val| sum + val }
46
- end
47
- # emit [country, size_0_sum, size_1_sum, ...]
48
- def finalize
49
- yield [key, sums].flatten
50
- end
51
- end
52
-
53
- end
54
-
55
- Wukong::Script.new(JeanSizes::Mapper, JeanSizes::JeansListReducer).run
@@ -1,44 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)
3
- require 'rubygems'
4
- require 'wukong/script'
5
-
6
- Settings.define :ripd_root, :default => '/data/chimpmark/ripd'
7
- BNC_SOURCE_FILE='ucrel.lancs.ac.uk/bncfreq/lists/1_1_all_fullalpha.txt'
8
-
9
- # File 1_1_all_fullalpha.txt -- 794771 lines
10
- #
11
- # cat /data/chimpmark/ripd/ucrel.lancs.ac.uk/bncfreq/lists/1_1_all_fullalpha.txt | ./bnc_word_freq.rb --map | sort -nk3 > /data/chimpmark/rawd/bnc_word_freq/bnc_word_freq.tsv
12
-
13
- class BncParser < Wukong::Streamer::RecordStreamer
14
- def before_stream
15
- @head_word, @part_of_speech, @head_word_stats = ["","",[]]
16
- $stdin.readline
17
- $stdin.readline
18
- end
19
-
20
- def process _, word, pos, variant, freq_ppm, range, dispersion
21
- word_stats = [freq_ppm, range, dispersion]
22
-
23
- unless word == "@" # lemma for a different head word
24
- @head_word = word
25
- @part_of_speech = pos
26
- @head_word_stats = word_stats
27
- end
28
-
29
- weirdness = (@head_word =~ /[^a-zA-Z]/)
30
-
31
- if variant == '%' # head word with lemmas
32
- word_stats = ['','','']
33
- elsif variant == ':' # head word with no lemmas
34
- variant = word
35
- else
36
- weirdness = weirdness || (variant =~ /[^a-zA-Z]/)
37
- end
38
- yield [@head_word, @part_of_speech, @head_word_stats, variant, word_stats, (weirdness ? 1 : 0)].flatten.join("\t")
39
- end
40
- end
41
-
42
- Wukong.run(
43
- BncParser, nil
44
- )
@@ -1,47 +0,0 @@
1
-
2
- class BucketCounter
3
- BUCKET_SIZE = 2**24
4
- attr_reader :total
5
-
6
- def initialize
7
- @hsh = Hash.new{|h,k| h[k] = 0 }
8
- @total = 0
9
- end
10
-
11
- # def [] val
12
- # @hsh[val]
13
- # end
14
- # def << val
15
- # @hsh[val] += 1; @total += 1 ; self
16
- # end
17
-
18
- def [] val
19
- @hsh[val.hash % BUCKET_SIZE]
20
- end
21
- def << val
22
- @hsh[val.hash % BUCKET_SIZE] += 1; @total += 1 ; self
23
- end
24
-
25
- def insert *words
26
- words.flatten.each{|word| self << word }
27
- end
28
- def clear
29
- @hsh.clear
30
- @total = 0
31
- end
32
-
33
- def stats
34
- { :total => total,
35
- :size => size,
36
- }
37
- end
38
- def size() @hsh.size end
39
-
40
- def full?
41
- size.to_f / BUCKET_SIZE > 0.5
42
- end
43
-
44
- def each *args, &block
45
- @hsh.each(*args, &block)
46
- end
47
- end
@@ -1,86 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'wukong/script'
4
-
5
- #
6
- # Use the stanford NLP parse to split a piece of text into sentences
7
- #
8
- # @example
9
- # SentenceParser.split("Beware the Jabberwock, my son! The jaws that bite, the claws that catch! Beware the Jubjub bird, and shun The frumious Bandersnatch!")
10
- # # => [["Beware", "the", "Jabberwock", ",", "my", "son", "!"], ["The", "jaws", "that", "bite", ",", "the", "claws", "that", "catch", "!"], ["Beware", "the", "Jubjub", "bird", ",", "and", "shun", "The", "frumious", "Bandersnatch", "!"]]
11
- #
12
- class SentenceParser
13
- def self.processor
14
- return @processor if @processor
15
- require 'rubygems'
16
- require 'stanfordparser'
17
- @processor = StanfordParser::DocumentPreprocessor.new
18
- end
19
-
20
- def self.split line
21
- processor.getSentencesFromString(line).map{|s| s.map{|w| w.to_s } }
22
- end
23
- end
24
-
25
- #
26
- # takes one document per line
27
- # splits into sentences
28
- #
29
- class WordNGrams < Wukong::Streamer::LineStreamer
30
- def recordize line
31
- line.strip!
32
- line.gsub!(%r{^<http://dbpedia.org/resource/([^>]+)> <[^>]+> \"}, '') ; title = $1
33
- line.gsub!(%r{\"@en \.},'')
34
- [title, SentenceParser.split(line)]
35
- end
36
-
37
- def process title, sentences
38
- sentences.each_with_index do |words, idx|
39
- yield [title, idx, words].flatten
40
- end
41
- end
42
- end
43
-
44
- Wukong.run WordNGrams, nil, :partition_fields => 1, :sort_fields => 2
45
-
46
- # ---------------------------------------------------------------------------
47
- #
48
- # Run Time:
49
- #
50
- # Job Name: dbpedia_abstract_to_sentences.rb---/data/rawd/encyc/dbpedia/dbpedia_dumps/short_abstracts_en.nt---/data/rawd/encyc/dbpedia/dbpedia_parsed/short_abstract_sentences
51
- # Status: Succeeded
52
- # Started at: Fri Jan 28 03:14:45 UTC 2011
53
- # Finished in: 41mins, 50sec
54
- # 3 machines: master m1.xlarge, 2 c1.xlarge workers; was having some over-memory issues on the c1.xls
55
- #
56
- # Counter Reduce Total
57
- # SLOTS_MILLIS_MAPS 0 10 126 566
58
- # Launched map tasks 0 15
59
- # Data-local map tasks 0 15
60
- # SLOTS_MILLIS_REDUCES 0 1 217
61
- # HDFS_BYTES_READ 1 327 116 133 1 327 116 133
62
- # HDFS_BYTES_WRITTEN 1 229 841 020 1 229 841 020
63
- # Map input records 3 261 096 3 261 096
64
- # Spilled Records 0 0
65
- # Map input bytes 1 326 524 800 1 326 524 800
66
- # SPLIT_RAW_BYTES 1 500 1 500
67
- # Map output records 9 026 343 9 026 343
68
- #
69
- # Job Name: dbpedia_abstract_to_sentences.rb---/data/rawd/encyc/dbpedia/dbpedia_dumps/long_abstracts_en.nt---/data/rawd/encyc/dbpedia/dbpedia_parsed/long_abstract_sentences
70
- # Status: Succeeded
71
- # Started at: Fri Jan 28 03:23:08 UTC 2011
72
- # Finished in: 41mins, 11sec
73
- # 3 machines: master m1.xlarge, 2 c1.xlarge workers; was having some over-memory issues on the c1.xls
74
- #
75
- # Counter Reduce Total
76
- # SLOTS_MILLIS_MAPS 0 19 872 357
77
- # Launched map tasks 0 29
78
- # Data-local map tasks 0 29
79
- # SLOTS_MILLIS_REDUCES 0 5 504
80
- # HDFS_BYTES_READ 2 175 900 769 2 175 900 769
81
- # HDFS_BYTES_WRITTEN 2 280 332 736 2 280 332 736
82
- # Map input records 3 261 096 3 261 096
83
- # Spilled Records 0 0
84
- # Map input bytes 2 174 849 644 2 174 849 644
85
- # SPLIT_RAW_BYTES 2 533 2533
86
- # Map output records 15 425 467 15 425 467
@@ -1,53 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)
3
- require 'rubygems'
4
- require 'wukong/script'
5
- require 'bucket_counter'
6
-
7
- #
8
- # Coocurrence counts
9
- #
10
-
11
- #
12
- # Input is a list of document-idx-sentences, each field is tab-separated
13
- # title idx word_a word_b word_c ...
14
- #
15
- # This emits each co-courring pair exactly once; in the case of a three-word
16
- # sentence the output would be
17
- #
18
- # word_a word_b
19
- # word_a word_c
20
- # word_b word_c
21
- #
22
- class SentenceBigrams < Wukong::Streamer::RecordStreamer
23
- def process title, idx, *words
24
- words[0..-2].zip(words[1..-1]).each do |word_a, word_b|
25
- yield [word_a, word_b]
26
- end
27
- end
28
- end
29
-
30
- #
31
- # Combine multiple bucket counts into a single on
32
- #
33
- class CombineBuckets < Wukong::Streamer::AccumulatingReducer
34
- def get_key *fields
35
- fields[0..1]
36
- end
37
- def start! *args
38
- @total = 0
39
- end
40
- def accumulate *fields
41
- @total += 1
42
- end
43
- def finalize
44
- yield [@total, key].flatten
45
- end
46
- end
47
-
48
- Wukong.run(
49
- SentenceBigrams,
50
- CombineBuckets,
51
- :io_sort_record_percent => 0.3,
52
- :io_sort_mb => 300
53
- )
@@ -1,66 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)
3
- require 'rubygems'
4
- require 'wukong/script'
5
- require 'bucket_counter'
6
-
7
- #
8
- # Coocurrence counts
9
- #
10
-
11
- #
12
- # Input is a list of document-idx-sentences, each field is tab-separated
13
- # title idx word_a word_b word_c ...
14
- #
15
- # This emits each co-courring pair exactly once; in the case of a three-word
16
- # sentence the output would be
17
- #
18
- # word_a word_b
19
- # word_a word_c
20
- # word_b word_c
21
- #
22
- class SentenceCoocurrence < Wukong::Streamer::RecordStreamer
23
- def initialize *args
24
- super *args
25
- @bucket = BucketCounter.new
26
- end
27
-
28
- def process title, idx, *words
29
- @bucket << words[0..-2].zip(words[1..-1])
30
- dump_bucket if @bucket.full?
31
- end
32
-
33
- def dump_bucket
34
- @bucket.each do |pair_key, count|
35
- emit [pair_key, count]
36
- end
37
- $stderr.puts "bucket stats: #{@bucket.stats.inspect}"
38
- @bucket.clear
39
- end
40
-
41
- def after_stream
42
- dump_bucket
43
- end
44
- end
45
-
46
- #
47
- # Combine multiple bucket counts into a single on
48
- #
49
- class CombineBuckets < Wukong::Streamer::AccumulatingReducer
50
- def start! *args
51
- @total = 0
52
- end
53
- def accumulate word, count
54
- @total += count.to_i
55
- end
56
- def finalize
57
- yield [@total, key] if @total > 20
58
- end
59
- end
60
-
61
- Wukong.run(
62
- SentenceCoocurrence,
63
- CombineBuckets,
64
- :io_sort_record_percent => 0.3,
65
- :io_sort_mb => 300
66
- )
@@ -1,138 +0,0 @@
1
- STOPWORDS_3 = %w[
2
- the
3
- of
4
- and
5
- a
6
- in
7
- to
8
- it
9
- is
10
- was
11
- I
12
- for
13
- that
14
- you
15
- he
16
- be
17
- with
18
- on
19
- by
20
- at
21
- have
22
- are
23
- not
24
- this
25
- but
26
- had
27
- they
28
- his
29
- from
30
- she
31
- which
32
- or
33
- we
34
- an
35
- were
36
- as
37
- do
38
- been
39
- their
40
- has
41
- would
42
- there
43
- what
44
- will
45
- all
46
- if
47
- can
48
- her
49
- said
50
- who
51
- one
52
- so
53
- up
54
- them
55
- when
56
- some
57
- could
58
- him
59
- into
60
- its
61
- then
62
- two
63
- out
64
- time
65
- my
66
- about
67
- did
68
- your
69
- now
70
- me
71
- other
72
- only
73
- just
74
- more
75
- these
76
- also
77
- any
78
- see
79
- very
80
- may
81
- well
82
- should
83
- than
84
- how
85
- get
86
- way
87
- our
88
- made
89
- got
90
- after
91
- many
92
- those
93
- go
94
- being
95
- because
96
- down
97
- such
98
- through
99
- over
100
- must
101
- still
102
- even
103
- take
104
- too
105
- here
106
- come
107
- own
108
- last
109
- does
110
- oh
111
- say
112
- no
113
- where
114
- us
115
- same
116
- might
117
- yes
118
- however
119
- put
120
- world
121
- another
122
- want
123
- most
124
- again
125
- never
126
- under
127
- much
128
- why
129
- each
130
- while
131
- off
132
- went
133
- used
134
- without
135
- give
136
- within
137
- ]
138
- RE_STOPWORDS_15 = '(?:'+STOPWORDS_3[0..15].join("|")+')'