wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,71 @@
1
+ require_relative('../../rake_helper')
2
+
3
+ Pathname.register_paths( # name the locations used by the geo tasks in this file
4
+ geo_data: [:data, 'geo'], # under the :data handle (not registered in this file; presumably by rake_helper -- confirm)
5
+ geo_work: [:work, 'geo'], # under the :work handle (not registered in this file; presumably by rake_helper -- confirm)
6
+ geo_code: File.dirname(__FILE__), # the directory that holds this rake file
7
+ # input files, located under :geo_data
8
+ iso_3166: [:geo_data, 'iso_codes', "iso_3166.tsv" ],
9
+ geonames_countries: [:geo_data, 'geonames', "geonames_countries.json" ],
10
+ # files written by the create_file tasks further down, located under :geo_work
11
+ countries_json: [:geo_work, "countries.json" ],
12
+ country_name_lookup: [:geo_work, "country_name_lookup.tsv" ],
13
+ )
14
+
15
+ chain :geo do
16
+ code_files = FileList[Pathname.of(:geo_code, '*.rb').to_s]
17
+ chain(:countries) do
18
+
19
+ task(:load) do # loads the geo model code, then the reconciled country data; invoked by the create_file tasks below
20
+ require_relative('./geo_models')
21
+ require_relative('./geo_json')
22
+ require_relative('./geonames_models')
23
+ require_relative('./iso_codes')
24
+ require_relative('./reconcile_countries')
25
+ CountryReconciler.load_reconciled_countries # defined outside this file; presumably populates Geo::Country -- confirm
26
+ end
27
+
28
+ # desc 'load the ISO 3166 countries'
29
+ # task(:countries_iso_3166, after: [code_files, :force]) do |dest|
30
+ # require_relative('./iso_codes')
31
+ # p Wukong::Data::CountryCode.for_any_name('Bolivia')
32
+ # end
33
+
34
+ # step(:geonames_countries, doc: 'load the Geonames countries',
35
+ # invoke: 'geo:countries:load',
36
+ # # , after: [code_files, :force]
37
+ # ) do |dest|
38
+ # Wukong::Data::GeonamesGeoJson.load(:geonames_countries)
39
+ # end
40
+
41
+ desc 'Add the iso_codes data to the geonames countries'
42
+ create_file(:countries_json, invoke: 'geo:countries:load', after: [code_files, :force]) do |dest|
43
+ Geo::Country.values.each do |country|
44
+ dest << country.to_json << "\n"
45
+ end
46
+ end
47
+
48
+ desc 'Add the iso_codes data to the geonames countries'
49
+ create_file(:country_name_lookup, invoke: 'geo:countries:load', after: [code_files, :force]) do |dest|
50
+ Geo::Country.values.each do |ct|
51
+ ct.names.each do |alt_name|
52
+ dest << [ct.country_id, ct.country_al3id, ct.country_numid,
53
+ ct.tld_id, ct.geonames_id,
54
+ ct.name,
55
+ Geo::Place.slugify_name(alt_name), alt_name
56
+ ].join("\t") << "\n"
57
+ end
58
+ end
59
+ end
60
+
61
+ # task(:country_name_lookup => :load) do
62
+ # Geo::CountryNameLookup.load
63
+ # end
64
+
65
+ end
66
+ end
67
+
68
+ task :default => [ # prerequisites run when rake is invoked without a task name
69
+ # 'geo:countries',
70
+ 'geo:countries:country_name_lookup'
71
+ ]
@@ -0,0 +1,62 @@
1
+ require 'gorillib'
2
+ require 'gorillib/data_munging'
3
+ require 'configliere'
4
+
5
+ S3_BUCKET = 'bigdata.chimpy.us'
6
+ S3_DATA_ROOT = "s3n://#{S3_BUCKET}/data"
7
+ HDFS_DATA_ROOT = '/data'
8
+
9
+ Settings.define :orig_data_root, default: HDFS_DATA_ROOT, description: "directory root for input data"
10
+ Settings.define :scratch_data_root, default: HDFS_DATA_ROOT, description: "directory root for scratch data"
11
+ Settings.define :results_data_root, default: HDFS_DATA_ROOT, description: "directory root for results data"
12
+ Settings.define :mini, description: 'Run in mini mode - operate inside the mini version of the specified universe',type: :boolean, default: false
13
+ Settings.define :universe, description: 'Universe to draw data from', finally: ->(c){ c.universe ||= (c.mini? ? "mini" : "full") }
14
+ Settings.define :pig_path, default: '/usr/local/bin/pig'
15
+ Settings.define :local, type: :boolean, default: false
16
+
17
+ def Settings.mini?; Settings.mini ? true : false; end # coerce the :mini setting to a strict true/false
18
+ def Settings.wu_run_cmd; return '--run=local' if local; '--run'; end # run switch handed to wukong scripts
19
+
20
+ def dir_exists? (dir)
21
+ if Settings.local
22
+ return File.exists? dir
23
+ else
24
+ `hadoop fs -test -e #{dir}`
25
+ return $?.exitstatus == 0
26
+ end
27
+ end
28
+
29
+ def wukong(script, input, output, options={})
30
+ input = Pathname.of(input)
31
+ output = Pathname.of(output)
32
+ if dir_exists? output
33
+ puts "#{output} exists. Assuming that this job has already run..."
34
+ return
35
+ end
36
+ opts = ['--rm']
37
+ options.each_pair do |k,v|
38
+ opts << "--#{k}=#{v}"
39
+ end
40
+ opts << input
41
+ opts << output
42
+ ruby(script, Settings.wu_run_cmd,*opts)
43
+ end
44
+
45
+ def wukong_xml(script, input, output, split_tag)
46
+ wukong(script,input,output,{split_on_xml_tag: split_tag})
47
+ end
48
+
49
+ def pig(script_name, options={})
50
+ cmd = Settings.pig_path
51
+ options.each_pair do |k,v|
52
+ v = Pathname.of(v) if v.is_a? Symbol
53
+ if k.to_s.include? '_out' and dir_exists? v
54
+ puts "#{v} already exists. Assuming that this job has already run..."
55
+ return
56
+ else
57
+ cmd += " -param #{k.upcase}=#{v}"
58
+ end
59
+ end
60
+ cmd += " #{script_name}"
61
+ sh cmd
62
+ end
@@ -0,0 +1 @@
1
+ Gemfile.lock
@@ -0,0 +1,4 @@
1
+ source 'http://rubygems.org'
2
+
3
+ gem 'gorillib', :path => '/Users/dlaw/dev/gorillib'
4
+ gem 'wukong', :path =>'/Users/dlaw/dev/wukong_og'
@@ -0,0 +1,28 @@
1
+ require 'configliere'
2
+ Settings.use :commandline # presumably lets settings be supplied as command-line flags -- confirm against configliere
3
+ # The helper is loaded before Settings.resolve! below; the wukong helper used by the :ish task is not defined in this file.
4
+ require_relative '../rake_helper'
5
+ # Resolve settings before any Settings.* value is read by the path registration below.
6
+ Settings.resolve!
7
+ # Named locations for the NOAA ISH weather extraction.
8
+ Pathname.register_paths(
9
+ project: 'noaa_ish',
10
+ universe: 'full', # NOTE(review): fixed to 'full' here; no Settings value is consulted for it -- confirm this is intended
11
+ # data roots come from settings, each with a fixed subdirectory appended
12
+ orig: [Settings.orig_data_root,'orig'],
13
+ scratch: [Settings.scratch_data_root, 'scratch'],
14
+ results: [Settings.results_data_root, 'results'],
15
+
16
+ # Origin data
17
+ noaa_ish_orig: [:orig, 'www1.ncdc.noaa.gov','pub','data','noaa'],
18
+ noaa_ish_test: [:noaa_ish_orig, '010010-99999-2012'],
19
+ # Results: <results>/<project>/<universe>
20
+ noaa_ish_results: [:results, :project, :universe],
21
+ )
22
+
23
+ namespace :extract do
24
+ desc 'Extract the NOAA ISH weather data from flat files'
25
+ task :ish do
26
+ wukong('extract_ish.rb', :noaa_ish_test, :noaa_ish_results) # input is the :noaa_ish_test path, output the results path
27
+ end
28
+ end
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+
4
+ require 'wukong'
5
+ require 'wukong/streamer/flatpack_streamer'
6
+
7
+ module Weather
8
+ class Mapper < Wukong::Streamer::FlatPackStreamer # the only customisation is the format declaration below
9
+ format "_4 i6 i5 s8 s4 sD6e3 D7e3 s5 i5 s5 s4 i3 ssD4e1ii5 ssbi6 sssD5e1 sD5e1 sD5e1 ss*" # NOTE(review): presumably the fixed-width field layout of one ISH record -- confirm against FlatPackStreamer
10
+ end
11
+ end
12
+
13
+ Wukong::Script.new(Weather::Mapper, nil).run
@@ -0,0 +1,119 @@
1
+ require 'gorillib'
2
+ require 'gorillib/model'
3
+ require 'gorillib/model/serialization'
4
+ require 'gorillib/model/positional_fields'
5
+ # One raw weather observation. Includes PositionalFields, so field order presumably mirrors the record layout; `blankish` lists sentinel values that presumably mark a missing reading (TODO confirm both against gorillib).
6
+ class RawWeatherReport
7
+ include Gorillib::Model
8
+ include Gorillib::Model::PositionalFields
9
+
10
+ field :usaf_station_id, Integer
11
+
12
+ # wban id appears to have 99999 as a blank value even though
13
+ # it is not specified as such in the docs
14
+ field :wban_station_id, Integer
15
+
16
+ field :obs_date, String
17
+ field :obs_time, String
18
+
19
+ field :obs_data_source, String, blankish: ["9", '', nil]
20
+
21
+ field :wstn_latitude, Float, blankish: [99.999, '', nil]
22
+ field :wstn_longitude, Float, blankish: [999.999, '' , nil]
23
+
24
+ field :report_type_code, String, blankish: ["99999", '', nil]
25
+
26
+ field :wstn_elevation, Integer, blankish: [9999, '', nil]
27
+
28
+ field :wstn_call_letters, String, blankish: ["99999", '', nil]
29
+
30
+ field :quality_control_process_name, String
31
+ # Each measurement below is followed by its *_qual companion field.
32
+ field :wind_direction, Integer, blankish: [999, '', nil]
33
+ field :wind_direction_qual, String
34
+ field :wind_observation_type, String, blankish: ["9", '', nil]
35
+ field :wind_speed, Float, blankish: [999.9, '', nil]
36
+ field :wind_speed_qual, String
37
+
38
+ field :ceiling_height, Integer, blankish: [99999, '', nil]
39
+ field :ceiling_qual, String
40
+ field :ceiling_determination, String, blankish:['9', '', nil]
41
+ field :cavok, :boolean
42
+
43
+ field :visibility, Integer, blankish: [999999, '', nil]
44
+ field :visibility_qual, String
45
+ field :visibility_variability_code, String, blankish: ['9', '', nil]
46
+ field :visibility_variability_code_qual, String
47
+
48
+ field :air_temp, Float, blankish: [999.9, '', nil]
49
+ field :air_temp_qual, String
50
+
51
+ field :dew_point, Float, blankish: [999.9, '', nil]
52
+ field :dew_point_qual, String
53
+
54
+ field :sea_level_pressure, Float, blankish: [9999.9, '' , nil]
55
+ field :sea_level_pressure_qual, String
56
+
57
+ field :raw_extended_observations, String
58
+ end
59
+ # Holds the *_qual fields of a report; WeatherReport#receive! forwards every `_qual` key here through receive_<field name>.
60
+ class ReportMetadata
61
+ include Gorillib::Model
62
+ field :wind_direction_qual, String
63
+ field :wind_speed_qual, String
64
+ field :ceiling_qual, String
65
+ field :visibility_qual, String
66
+ field :visibility_variability_code_qual, String
67
+ field :air_temp_qual, String
68
+ field :dew_point_qual, String
69
+ field :sea_level_pressure_qual, String
70
+
71
+ end
72
+
73
+ class WeatherReport
74
+ include Gorillib::Model
75
+
76
+ field :wstn_id, String #wban-usad
77
+
78
+ field :wstn_latitude, Float
79
+ field :wstn_longitude, Float
80
+ field :wstn_elevation, Float
81
+
82
+ field :obs_date, String
83
+ field :obs_time, String
84
+
85
+ field :wind_direction, Integer
86
+ field :wind_observation_type, String
87
+ field :wind_speed, Float
88
+
89
+ field :ceiling_height, Integer
90
+ field :ceiling_determination, String
91
+ field :cavok, :boolean
92
+
93
+ field :visibility, Integer
94
+ field :visibility_variability_code, :boolean
95
+
96
+ field :air_temp, Float
97
+
98
+ field :dew_point, Float
99
+
100
+ field :sea_level_pressure, Float
101
+
102
+ field :metadata, ReportMetadata, default: ReportMetadata.new
103
+
104
+ def receive!(hsh={})
105
+ # prune the quality fields
106
+ hsh.keys.each do |key|
107
+ next if (key.to_s =~ /[^_]*_qual/).nil?
108
+ val = hsh.delete(key)
109
+ metadata.send("receive_#{key.to_s}", val)
110
+ end
111
+ # transform the ids
112
+ if hsh.keys.include? :usaf_station_id and hsh.keys.include? :wban_station_id
113
+ id = hsh.delete(:usaf_station_id).to_s
114
+ id += "-#{hsh.delete :wban_station_id}"
115
+ hsh[:wstn_id] = id
116
+ end
117
+ super(hsh)
118
+ end
119
+ end
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding:UTF-8
3
+
4
+ require 'open-uri'
5
+ require 'configliere'
6
+
7
+ NOAA_URL = 'http://www1.ncdc.noaa.gov/pub/data/noaa/'
8
+ Settings.use :commandline
9
+
10
+ Settings({
11
+ years: [1901],
12
+ verbose: false,
13
+ out_dir: /data/rawd/noaa/isd/,
14
+ un_gzip: false,
15
+ })
16
+
17
+ Settings.define :years, flag 'y', description: "Years to download"
18
+ Settings.define :verbose, flag 'v', description: "Get chatty", type: :boolean
19
+ Settings.define :un_gzip, flag 'g', description: "Unzip the files as they are uploaded", type: :boolean
20
+ Settings.define :out_dir, flag 'o', description: "The directory in the hdfs to put the files"
21
+
22
+ Settings.resolve!
23
+
24
+ def get_files_for_year(year)
25
+ year_page = open("#{NOAA_URL}/#{year}")
26
+ years = []
27
+ year_page.each_line do |line|
28
+ next unless line =~ /<a href="[^.]*\.gz">/
29
+ match = /<a href="([^.]*\.gz)">/.match(line)
30
+ years << match[1] if not match.nil?
31
+ end
32
+ return years
33
+ end
34
+
35
+ years.each do |year|
36
+ puts "Uploading files for year #{year}..." if Settings[:verbose]
37
+ get_files_for_year(year).each do |file|
38
+ puts " Uploading #{file}..." if Settings[:verbose]
39
+ path = "#{NOAA_URL}/#{year}/#{file}"
40
+ if Settings[:un_gzip]
41
+ `curl '#{path}' | zcat | hdp-put #{Settings[:out_dir]}/#{year}/#{file}`
42
+ else
43
+ `curl #{file} | hdp-put #{Settings[:out_dir]}/#{year}/#{file}`
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,34 @@
1
+ ## Encodings
2
+ All SQL dumps are theoretically encoded in UTF-8, but the Wikipedia dumps contain malformed characters. Because of this, you might see an 'Invalid UTF-8 byte sequence' error when running a Wukong script.
3
+
4
+ To fix this, use `guard_encoding` in `MungingUtils` to filter out malformed characters before attempting to process them. `guard_encoding` replaces all invalid characters with '�'.
5
+
6
+ If you need to ensure that all characters are valid UTF-8 when piping things around on the command line, then pipe your stream through `char_filter.rb`.
7
+
8
+ If you need an invalid UTF-8 character, any single byte above \x7F will do on its own (bytes up to \x7F are plain ASCII), e.g.:
9
+
10
+ > char = "\x80"
11
+ => "\x80"
12
+ > char.encoding.name
13
+ => "UTF-8"
14
+ > char.valid_encoding?
15
+ => false
16
+
17
+ [James Gray's blog](http://blog.grayproductions.net/articles/understanding_m17n) is really valuable for further reading on this.
18
+
19
+ ## Dates
20
+ Date information should be formatted as follows:
21
+
22
+ +----------+--------+--------------------------+-------------+
23
+ | int | int | long or float | int |
24
+ +----------+--------+--------------------------+-------------+
25
+ | YYYYMMDD | HHMMSS | Seconds since Unix epoch | Day of week |
26
+ +----------+--------+--------------------------+-------------+
27
+
28
+ All dates and times should be in the UTC time zone.
29
+
30
+ Hours go from 0 to 23
31
+
32
+ Months go from 01 to 12
33
+
34
+ Day of week goes from 0 to 6 (Sunday to Saturday)
@@ -0,0 +1,193 @@
1
+ require 'configliere'
2
+ Settings.use :commandline
3
+
4
+ require_relative '../rake_helper'
5
+
6
+ DUMPS = ['20110722','20110803','20110901','20111007','20111115',
7
+ '20111201','20120104','20120211','20120307','20120403',
8
+ '20120502','20120601','20120702','20120802']
9
+
10
+ Settings.define :pageviews_date_range_slug_in, description: 'The pageviews date range', default: '2012/2012-08'
11
+ Settings.define :pageviews_date_range_slug_out, description: 'The pageviews date range', default: '2012/2012-08'
12
+ Settings.define :dump, description: 'The wikipedia dump to use', default: DUMPS[-1]
13
+ Settings.define :n1_node_id, description: 'Node to construct the N1 subuniverse around', default: '13692155'
14
+ Settings.define :n1_subuniverse, description: 'The output universe for N1 subuniverse generation', finally: ->(c) {c.n1_subuniverse ||= "n1_#{c.n1_node_id}"}
15
+ Settings.define :num_reducers, type: Integer, default: nil
16
+ Settings.resolve!
17
+
18
+ if (not DUMPS.include? Settings.dump)
19
+ puts "Invalid dump specified. Must be one of [#{DUMPS.join(', ')}].\nExiting..."
20
+ exit
21
+ end
22
+
23
+ =begin
24
+ Universe is the universe that data is drawn from.
25
+ It is also the default universe the data is written into.
26
+ There are tasks (namely subuniverse generation) that do not write out
27
+ into the supplied universe. Be careful
28
+ =end
29
+ # Named locations used by the tasks below. Settings.universe and the *_data_root settings are not defined in this file; presumably ../rake_helper provides them (TODO confirm).
30
+ Pathname.register_paths(
31
+ project: 'wikipedia',
32
+ universe: [Settings.universe],
33
+
34
+ orig: [Settings.orig_data_root,'ripd'],
35
+ scratch: [Settings.scratch_data_root, 'scratch'],
36
+ results: [Settings.results_data_root, 'results'],
37
+
38
+ #Origin
39
+ wiki_dumps: [:orig,'dumps.wikimedia.org'],
40
+ orig_enwiki: [:wiki_dumps, 'enwiki'],
41
+ orig_pageviews: [:wiki_dumps, 'other', 'pagecounts-raw', Settings.pageviews_date_range_slug_in],
42
+ orig_articles: [:orig_enwiki, Settings.dump, "enwiki-#{Settings.dump}-pages-articles.xml.gz"],
43
+ orig_pages: [:orig_enwiki, Settings.dump, "enwiki-#{Settings.dump}-page.sql.gz"],
44
+ orig_pagelinks: [:orig_enwiki, Settings.dump,"enwiki-#{Settings.dump}-pagelinks.sql.gz"],
45
+
46
+ # Scratch
47
+ wiki_scratch: [:scratch, :project, :universe],
48
+ page_metadata_scratch: [:wiki_scratch,'page_metadata'],
49
+ articles_scratch: [:wiki_scratch, 'articles'],
50
+ pageviews_scratch: [:wiki_scratch, 'pageviews',Settings.pageviews_date_range_slug_out],
51
+ pagelinks_scratch: [:wiki_scratch, 'pagelinks'],
52
+
53
+ # Results
54
+ wiki_results: [:results, :project, :universe],
55
+ page_metadata_results: [:wiki_results, 'page_metadata'],
56
+ pageviews_results: [:wiki_results, 'pageviews'],
57
+ articles_results: [:wiki_results, 'articles'],
58
+ pagelinks_results: [:wiki_results, 'pagelinks'],
59
+ undirected_pagelinks_results: [:wiki_results, 'undirected_pagelinks'],
60
+ redirects_pagelinks_results: [:wiki_results, 'redirects_pagelinks'],
61
+ redirects_page_metadata_results: [:wiki_results, 'redirects_page_metadata'],
62
+
63
+ # N1 Subuniverse
64
+ n1_results: [:results,'wikipedia', Settings.n1_subuniverse],
65
+ n1_nodes_results: [:n1_results, 'nodes'],
66
+ n1_edges_results: [:n1_results, 'edges'],
67
+ n1_page_metadata_results: [:n1_results, 'page_metadata'],
68
+ n1_articles_results: [:n1_results, 'articles'],
69
+ n1_pageviews_results: [:n1_results, 'pageviews'],
70
+
71
+ )
72
+
73
+ namespace :utils do
74
+ desc 'Fetch a list of all Wikipedia namespaces and their IDs'
75
+ task :get_namespaces do
76
+ if File.exists 'utils/namespaces.json'
77
+ puts 'utils/namespaces.json exists... Assuming that namespaces have already been downloaded'
78
+ return
79
+ end
80
+ ruby('utils/get_namespaces.rb')
81
+ end
82
+ end
83
+ namespace :extract do
84
+ desc 'Extract the Wikipedia article corpus from bzipped XML files'
85
+ task :articles do
86
+ wukong_xml('articles/extract_articles.rb', :orig_articles, :articles_results)
87
+ end
88
+
89
+ desc 'Extract the Wikipedia pages table from gzipped SQL dumps'
90
+ task :page_metadata do
91
+ wukong('page_metadata/extract_page_metadata.rb', :orig_pages, :page_metadata_results)
92
+ end
93
+
94
+ desc 'Extract Wikipedia pageview data from gzipped server logs'
95
+ task :pageviews do
96
+ if Settings.num_reducers.nil?
97
+ wukong('pageviews/extract_pageviews.rb', :orig_pageviews, :pageviews_scratch)
98
+ else
99
+ wukong('pageviews/extract_pageviews.rb', :orig_pageviews, :pageviews_scratch,{reduce_tasks: Settings.num_reducers})
100
+ end
101
+ end
102
+
103
+ desc 'Extract Wikipedia pagelinks data from gzipped SQL dumps'
104
+ task :pagelinks do
105
+ wukong('pagelinks/extract_pagelinks.rb', :orig_pagelinks, :pagelinks_scratch)
106
+ end
107
+ end
108
+ namespace :augment do
109
+ desc 'Augment extracted Wikipedia pageview data with page ID and other metadata'
110
+ task :pageviews => ["extract:pageviews", "extract:page_metadata"] do
111
+ pig('pageviews/augment_pageviews.pig',{
112
+ page_metadata: :page_metadata_results,
113
+ extracted_pageviews: :pageviews_scratch,
114
+ augmented_pageviews_out: :pageviews_results,
115
+ })
116
+ end
117
+
118
+ desc 'Augment Wikipedia pagelinks data with page metadata'
119
+ task :pagelinks => ["extract:pagelinks","extract:page_metadata"] do
120
+ pig('pagelinks/augment_pagelinks.pig',{
121
+ page_metadata: :page_metadata_results,
122
+ extracted_pagelinks: :pagelinks_scratch,
123
+ augmented_pagelinks_out: :pagelinks_results,
124
+ })
125
+ end
126
+
127
+ desc 'Undirect the Wikipedia pagelinks graph'
128
+ task :pagelinks_undirect => "augment:pagelinks" do
129
+ pig('pagelinks/undirect_pagelinks.pig',{
130
+ augmented_pagelinks: :pagelinks_results,
131
+ undirected_pagelinks_out: :pagelinks_undirected_results,
132
+ })
133
+ end
134
+ end
135
+ namespace :n1 do
136
+ desc 'Generate a list of node ids for the N1 neighborhood of the specified node'
137
+ task :nodes => 'augment:pagelinks_undirect' do
138
+ pig('n1_subuniverse/n1_nodes.pig',{
139
+ undirected_pagelinks: :undirected_pagelinks_results,
140
+ hub: Settings.n1_node_id,
141
+ n1_nodes_out: :n1_nodes_results,
142
+ })
143
+ end
144
+ desc 'Extract pagelinks for the N1 neighborhood of the specified node'
145
+ task :undirected_pagelinks => ['augment:pagelinks_undirect', :nodes] do
146
+ pig('subuniverse/sub_undirected_pagelinks_within.pig',{
147
+ undirected_pagelinks: :undirected_pagelinks_results,
148
+ sub_nodes: :n1_nodes_results,
149
+ sub_pagelinks_out: :n1_edges_results,
150
+ })
151
+ end
152
+ desc 'Extract page metadata for the N1 neighborhood of the specified node'
153
+ task :page_metadata => ['augment:page_metadata', :nodes] do
154
+ pig('subuniverse/sub_page_metadata.pig',{
155
+ page_metadata: :page_metadata_results,
156
+ sub_nodes: :n1_nodes_results,
157
+ sub_page_metadata_out: :n1_page_metadata_results,
158
+ })
159
+ end
160
+ desc 'Extract articles for the N1 neighborhood of the specified node'
161
+ task :articles => ['extract:articles', :nodes] do
162
+ pig('subuniverse/sub_articles.pig',{
163
+ articles: :articles_results,
164
+ sub_nodes: :n1_nodes_results,
165
+ sub_articles_out: :n1_articles_results,
166
+ })
167
+ end
168
+ desc 'Extract pageview data for the N1 neighborhood of the specified node'
169
+ task :pageviews => ['augment:pageviews', :nodes] do
170
+ pig('subuniverse/sub_pageviews.pig',{
171
+ pageviews: :pageviews_results,
172
+ sub_nodes: :n1_nodes_results,
173
+ sub_pageviews_out: :n1_pageviews_results,
174
+ })
175
+ end
176
+ end
177
+ namespace :redirects do
178
+ desc 'Extract redirects from pagemetadata table'
179
+ task :redirects_page_metadata => 'extract:page_metadata' do
180
+ pig('redirects/redirects_page_metadata.pig',{
181
+ page_metadata: :page_metadata_results,
182
+ redirects_out: :redirects_page_metadata_results,
183
+ })
184
+ end
185
+ desc 'Extract redirect links from pagelinks table'
186
+ task :redirect_pagelinks => ['redirects_page_metadata','augment:pagelinks'] do
187
+ pig('subuniverse/sub_pagelinks_from.pig',{
188
+ pagelinks: :pagelinks_results,
189
+ sub_nodes: :redirects_page_metadata_results,
190
+ sub_pagelinks_out: :redirects_pagelinks_results,
191
+ })
192
+ end
193
+ end