wukong 3.0.0.pre → 3.0.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,32 @@
/*
 * Augments raw pageview data with the page ID.
 *
 * Pageview stats are *theoretically* uniquely keyed by namespace and
 * title, so that pair is used to join pageviews against page_metadata.
 *
 * In practice the original pageview stats only give the URL visited, and
 * reliably extracting namespace and title from the URL is difficult.
 * Page names change, redirects happen, and many other small things can go
 * wrong with the join. Every pageview row is kept in the final table; the
 * page id is simply blank in rows where the join failed.
 *
 * Output format:
 *   page_id:int, namespace:int, title:chararray, num_visitors:long,
 *   date:int, time:int, epoch_time:long, day_of_week:int
 */

%default PAGE_METADATA           '/data/results/wikipedia/full/page_metadata' -- page metadata for all Wikipedia pages
%default EXTRACTED_PAGEVIEWS     '/data/scratch/wikipedia/full/pageviews'     -- raw extracted pageview stats (see extract_pageviews.rb)
%default AUGMENTED_PAGEVIEWS_OUT '/data/results/wikipedia/full/pageviews'     -- where output will be stored

page_metadata = LOAD '$PAGE_METADATA' AS (
    id:int, namespace:int, title:chararray,
    restrictions:chararray, counter:long, is_redirect:int, is_new:int,
    random:float, touched:int, page_latest:int, len:int);

pageviews = LOAD '$EXTRACTED_PAGEVIEWS' AS (
    namespace:int, title:chararray,
    num_visitors:long, date:int, time:int, epoch_time:long, day_of_week:int);

-- RIGHT OUTER keeps every pageview row even when no metadata matches;
-- page_metadata::id comes through as NULL for the unmatched rows.
first_join = JOIN page_metadata BY (namespace, title) RIGHT OUTER, pageviews BY (namespace, title);

final = FOREACH first_join GENERATE
    page_metadata::id, pageviews::namespace, pageviews::title, pageviews::num_visitors,
    pageviews::date,   pageviews::time,      pageviews::epoch_time, pageviews::day_of_week;

STORE final INTO '$AUGMENTED_PAGEVIEWS_OUT';
@@ -0,0 +1,85 @@
#!/usr/bin/env ruby
# encoding:UTF-8
#
# Hadoop-streaming mapper that extracts English-Wikipedia pageview records
# from raw Wikimedia `pagecounts-YYYYMMDD-HH0000.gz` dumps.
#
# Pig output format:
#   namespace:int, title:chararray, num_visitors:long, date:int, time:int,
#   epoch_time:long, day_of_week:int

$:.unshift '/home/dlaw/dev/wukong_og/lib'
$:.unshift '/home/dlaw/dev/gorillib/lib'

require 'uri'
require 'pathname'
require 'json'
require 'wukong'
require 'wukong/streamer'
require 'wukong/streamer/encoding_cleaner'
load '/home/dlaw/dev/wukong/examples/munging/wikipedia/utils/munging_utils.rb'

# Hadoop streaming sets map_input_file; this default lets the script run locally.
ENV['map_input_file'] ||= 'pagecounts-20071222-100000.gz'

class String
  # True when this pagecounts line belongs to the English Wikipedia project
  # (lines begin with the project code, e.g. "en Some_Page 3 1234").
  def is_enwiki?
    return (not (self =~ /^en /).nil?)
  end

  # True once we are past the (alphabetically sorted) "en " block of the
  # dump, so the mapper can stop scanning early.
  def is_after_enwiki?
    return (not (self =~ /^(e[o-z][a-z]*|[f-z][a-z]+) /).nil?)
  end
end

module PageviewsExtractor
  class Mapper < Wukong::Streamer::LineStreamer
    include Wukong::Streamer::EncodingCleaner
    include MungingUtils

    # Localized namespace prefix -> numeric namespace id.
    # File.read avoids leaking the handle the original File.open never closed.
    NAMESPACES = JSON.parse(
      File.read("/home/dlaw/dev/wukong/examples/munging/wikipedia/utils/namespaces.json",
                encoding: 'UTF-8'))

    # The filename strings are formatted as
    #   pagecounts-YYYYMMDD-HH0000.gz
    #
    # @param filename [String] basename of the input dump file
    # @return [Time] start of the hour the file covers (local zone)
    def time_from_filename(filename)
      parts = filename.split('-')
      year  = parts[1][0..3].to_i
      month = parts[1][4..5].to_i
      day   = parts[1][6..7].to_i
      hour  = parts[2][0..1].to_i
      Time.new(year, month, day, hour)
    end

    # Processes one raw pagecounts line:
    #   "<project> <escaped_title> <num_visitors> <bytes_transferred>"
    # yielding [namespace, title, num_visitors, *time_columns].
    def process line
      # we only want enwiki lines; input is sorted by project code, so once
      # we pass the "en " block the rest of the file can be skipped
      return if @done
      if line.is_after_enwiki?
        @done = true
        return
      end
      return if not line.is_enwiki?

      # drop the project code: fields = [title, num_visitors, bytes]
      fields     = line.split(' ')[1..-1]
      out_fields = []

      # namespace: look up the "Prefix:" part of the title; 0 (main) otherwise
      namespace = nil
      if fields[0].include? ':'
        namespace = NAMESPACES[fields[0].split(':')[0]]
        out_fields << (namespace || '0')
      else
        out_fields << '0'
      end

      # title, minus any recognized namespace prefix.
      # FIX: URI.unescape was removed in Ruby 3.0; URI::DEFAULT_PARSER.unescape
      # is the drop-in replacement with identical semantics.
      if namespace.nil?
        out_fields << URI::DEFAULT_PARSER.unescape(fields[0])
      else
        out_fields << URI::DEFAULT_PARSER.unescape(fields[0][(fields[0].index(':') || -1) + 1..-1])
      end

      # number of visitors in the hour.
      # FIX: pagecounts lines are "<title> <count> <bytes>" after the project
      # code is dropped, so the view count is fields[1]; the original emitted
      # fields[2] (bytes transferred) here.
      out_fields << fields[1]

      # date/time columns derived from the input filename
      file = Pathname.new(ENV['map_input_file']).basename
      time = time_from_filename(file.to_s)
      out_fields += time_columns_from_time(time)
      yield out_fields
    end
  end
end

Wukong::Script.new(PageviewsExtractor::Mapper, Wukong::Streamer::LineStreamer).run
@@ -0,0 +1,25 @@
1
+ # Pig Style Guide
2
+
3
+ - Everything except names should be in all caps. E.g.
4
+
5
+ first_join = JOIN pages BY (namespace,title)
6
+ RIGHT OUTER, pageviews BY (namespace, title);
7
+
8
+ - Group and align columns in the script in ways that make sense. Don't be afraid of newlines. E.g.
9
+
10
+ second_pass = FOREACH second_pass_j GENERATE
11
+ first_pass::from_id, pages::id,
12
+ first_pass::from_namespace, first_pass::from_title,
13
+ first_pass::into_namespace, first_pass::into_title;
14
+
15
+ - Columns that form an important sub-set of the table's data should be easily accessible as a unit.
16
+
17
+ E.g. The edge list above has the from and into ids in the first and second columns, making it easy to just get an edge list of ids without the additional metadata.
18
+
19
+ - When at all possible, you should include sample LOAD statements in the comments for your script. This makes it easy for downstream scripts to consume your script's output.
20
+
21
+ - Parameterize as much as possible. All paths should be parameterized.
22
+
23
+ - Parameters should be in all caps, e.g. $NODE.
24
+
25
+ - Parameters should have defaults if at all possible. When you define the default, also include a comment describing the parameter.
@@ -0,0 +1,19 @@
/*
 * Filters the page metadata table, keeping only pages that are redirects.
 *
 * Output format (identical to page_metadata):
 *   (id:int, namespace:int, title:chararray, restrictions:chararray,
 *    counter:long, is_redirect:int, is_new:int, random:float, touched:int,
 *    page_latest:int, len:int)
 */

%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata'          -- page metadata for all pages in Wikipedia
%default REDIRECTS_OUT '/data/results/wikipedia/full/redirect_page_metadata' -- place to store page metadata for redirects

page_metadata = LOAD '$PAGE_METADATA' AS (
    id:int, namespace:int, title:chararray,
    restrictions:chararray, counter:long, is_redirect:int, is_new:int,
    random:float, touched:int, page_latest:int, len:int);

redirects = FILTER page_metadata BY (is_redirect == 1);
STORE redirects INTO '$REDIRECTS_OUT';
@@ -0,0 +1,23 @@
1
/*
 * This script filters the articles table, leaving only the articles
 * in the specified subuniverse.
 *
 * Output format:
 * page_id:int, title:chararray, namespace:int, rev_date:int, rev_time:int,
 * rev_epoch_time:long, rev_dow:int, article_text:chararray
 */

%default ARTICLES '/data/results/wikipedia/full/articles' -- all articles in the wikipedia corpus
%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
%default SUB_ARTICLES_OUT '/data/results/wikipedia/mini/articles' -- where output will be stored

articles = LOAD '$ARTICLES' AS (page_id:int, title:chararray, namespace:int,
    rev_date:int, rev_time:int, rev_epoch_time:long, rev_dow:int, article_text:chararray);
sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-- FIX: the articles relation declares no 'id' field; join on 'page_id'
-- (was: JOIN articles BY id, which fails schema resolution)
sub_articles_unfiltered = JOIN articles BY page_id, sub_nodes BY node_id;
-- project the join back down to the original article columns
sub_articles = FOREACH sub_articles_unfiltered GENERATE
    articles::page_id AS page_id, articles::title AS title, articles::namespace AS namespace,
    articles::rev_date AS rev_date, articles::rev_time AS rev_time,
    articles::rev_epoch_time AS rev_epoch_time, articles::rev_dow AS rev_dow,
    articles::article_text AS article_text;
STORE sub_articles INTO '$SUB_ARTICLES_OUT';
@@ -0,0 +1,24 @@
1
/*
 * Restricts the page metadata table to just the pages that belong
 * to the specified subuniverse.
 *
 * Output format (same as page_metadata):
 * id:int, namespace:int, title:chararray, restrictions:chararray, counter:long,
 * is_redirect:int, is_new:int, random:float, touched:int, page_latest:int, len:int
 */

%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata' -- metadata for all pages in the wikipedia corpus
%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
%default SUB_PAGE_METADATA_OUT '/data/results/wikipedia/mini/page_metadata' -- where output will be stored

page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray,
    restrictions:chararray, counter:long, is_redirect:int, is_new:int, random:float,
    touched:int, page_latest:int, len:int);
sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- inner join against the node list drops every page outside the subuniverse
metadata_with_nodes = JOIN page_metadata BY id, sub_nodes BY node_id;
-- project away the joined-in node_id column, keeping the original schema
sub_page_metadata = FOREACH metadata_with_nodes GENERATE
    page_metadata::id, page_metadata::namespace, page_metadata::title,
    page_metadata::restrictions, page_metadata::counter, page_metadata::is_redirect,
    page_metadata::is_new, page_metadata::random, page_metadata::touched,
    page_metadata::page_latest, page_metadata::len;
STORE sub_page_metadata INTO '$SUB_PAGE_METADATA_OUT';
@@ -0,0 +1,22 @@
1
/*
 * Keeps only the pagelinks whose *source* page lies inside the
 * supplied subuniverse.
 *
 * Output format (same as augmented_pagelinks):
 * from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
 */

%default PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all edges in the pagelink graph (must be *directed*)
%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored

all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
    from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- join on the link's origin: an edge survives iff its from_id is a subuniverse node
links_from_sub = JOIN all_pagelinks BY from_id, sub_nodes BY node_id;
sub_pagelinks = FOREACH links_from_sub GENERATE
    all_pagelinks::from_id, all_pagelinks::into_id,
    all_pagelinks::from_namespace, all_pagelinks::from_title,
    all_pagelinks::into_namespace, all_pagelinks::into_title;
STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
@@ -0,0 +1,22 @@
1
/*
 * This script filters the pagelinks table, leaving only the pagelinks
 * that terminate within supplied subuniverse.
 *
 * Output format (same as augmented_pagelinks):
 * from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
 */
-- NOTE(review): the original header listed the undirected schema
-- (node_a, node_b, a_into_b, ...), copy-pasted from the undirected filter;
-- the GENERATE below actually emits the directed from/into schema.

%default PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all edges in the pagelink graph (must be *directed*)
%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored

all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
    from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- keep an edge iff its destination (into_id) is a subuniverse node
sub_pagelinks_into = JOIN all_pagelinks BY into_id, sub_nodes BY node_id;
sub_pagelinks = FOREACH sub_pagelinks_into GENERATE
    all_pagelinks::from_id, all_pagelinks::into_id,
    all_pagelinks::from_namespace, all_pagelinks::from_title,
    all_pagelinks::into_namespace, all_pagelinks::into_title;
STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
@@ -0,0 +1,26 @@
1
/*
 * This script filters the pagelinks table, leaving only the pagelinks
 * that start and end within supplied subuniverse.
 *
 * Output format (same as augmented_pagelinks):
 * from_id:int, into_id:int, from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray
 */

-- FIX: the default path pointed at .../full/undirected_pagelinks, but the LOAD
-- schema below is the *directed* (from_id/into_id) schema used by the sibling
-- from/into filters; point the default at the directed pagelinks table.
%default PAGELINKS '/data/results/wikipedia/full/pagelinks' -- all edges in the pagelink graph (must be *directed*)
%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored

all_pagelinks = LOAD '$PAGELINKS' AS (from_id:int, into_id:int,
    from_namespace:int, from_title:chararray, into_namespace:int, into_title:chararray);
sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- first join keeps edges whose source is in the subuniverse ...
sub_pagelinks_in = JOIN all_pagelinks BY from_id, sub_nodes BY node_id;
-- ... second join additionally requires the destination to be in it too
sub_pagelinks_unfiltered = JOIN sub_pagelinks_in BY into_id, sub_nodes BY node_id;
sub_pagelinks = FOREACH sub_pagelinks_unfiltered GENERATE
    sub_pagelinks_in::all_pagelinks::from_id,
    sub_pagelinks_in::all_pagelinks::into_id,
    sub_pagelinks_in::all_pagelinks::from_namespace,
    sub_pagelinks_in::all_pagelinks::from_title,
    sub_pagelinks_in::all_pagelinks::into_namespace,
    sub_pagelinks_in::all_pagelinks::into_title;
STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
@@ -0,0 +1,29 @@
1
/*
 * This script filters the pageviews table, leaving only the pageviews
 * in the specified subuniverse.
 *
 * Parameters:
 *   PAGEVIEWS         - all pageviews in the wikipedia corpus
 *   SUB_NODES         - the list of nodes in your subuniverse
 *   SUB_PAGEVIEWS_OUT - the directory where output will be stored
 *
 * Output format: same schema as the input pageviews table:
 * page_id:int, title:chararray, namespace:int, rev_date:int, rev_time:int,
 * rev_epoch_time:long, rev_dow:int, article_text:chararray
 *
 * NOTE(review): this LOAD schema looks copy-pasted from the articles table
 * (article_text in a pageviews table?) -- confirm against the upstream job.
 */

%default PAGEVIEWS '/data/results/wikipedia/full/pageviews' -- all pageview stats for the English Wikipedia
%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
%default SUB_PAGEVIEWS_OUT '/data/results/wikipedia/mini/pageviews' -- where output will be stored

pageviews = LOAD '$PAGEVIEWS' AS (page_id:int, title:chararray, namespace:int,
    rev_date:int, rev_time:int, rev_epoch_time:long, rev_dow:int, article_text:chararray);
sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);
-- FIX: join on page_id; the relation declares no 'id' field
sub_pageviews_unfiltered = JOIN pageviews BY page_id, sub_nodes BY node_id;
-- FIX: disambiguate with the pageviews:: prefix -- the original used
-- articles::, a relation that does not exist in this script
sub_pageviews = FOREACH sub_pageviews_unfiltered GENERATE
    pageviews::page_id AS page_id, pageviews::title AS title, pageviews::namespace AS namespace,
    pageviews::rev_date AS rev_date, pageviews::rev_time AS rev_time,
    pageviews::rev_epoch_time AS rev_epoch_time, pageviews::rev_dow AS rev_dow,
    pageviews::article_text AS article_text;
STORE sub_pageviews INTO '$SUB_PAGEVIEWS_OUT';
@@ -0,0 +1,24 @@
1
/*
 * Keeps only the undirected pagelinks whose two endpoints both lie
 * inside the supplied subuniverse.
 *
 * Output format (same as undirected_pagelinks):
 * node_a:int, node_b:int, a_into_b:int, b_into_a:int, is_symmetric:int
 */

%default UNDIRECTED_PAGELINKS '/data/results/wikipedia/full/undirected_pagelinks' -- all edges in the pagelink graph
%default SUB_NODES '/data/results/wikipedia/mini/nodes' -- all nodes in the subuniverse
%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks' -- where output will be stored

all_pagelinks = LOAD '$UNDIRECTED_PAGELINKS' AS (node_a:int, node_b:int, a_into_b:int, b_into_a:int, is_symmetric:int);
sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- first join: endpoint A must be a subuniverse node
sub_pagelinks_in = JOIN all_pagelinks BY node_a, sub_nodes BY node_id;
-- second join: endpoint B must be one as well
sub_pagelinks_unfiltered = JOIN sub_pagelinks_in BY node_b, sub_nodes BY node_id;
-- strip the joined-in node ids, restoring the undirected_pagelinks schema
sub_pagelinks = FOREACH sub_pagelinks_unfiltered GENERATE
    sub_pagelinks_in::all_pagelinks::node_a AS node_a,
    sub_pagelinks_in::all_pagelinks::node_b AS node_b,
    sub_pagelinks_in::all_pagelinks::a_into_b AS a_into_b,
    sub_pagelinks_in::all_pagelinks::b_into_a AS b_into_a,
    sub_pagelinks_in::all_pagelinks::is_symmetric AS is_symmetric;
STORE sub_pagelinks INTO '$SUB_PAGELINKS_OUT';
@@ -0,0 +1,86 @@
1
#!/usr/bin/env ruby
# encoding:UTF-8

# A script that fetches the namespace -> id mapping for
# all wikipedia languages. The output is stored (by default)
# in a json file that represents a hash from namespace name => id

require 'ruby-progressbar'
require 'open-uri'
require 'set'
require 'configliere'
require 'json'

Settings.use :commandline

NS_FILE = 'namespaces'

Settings.define :out_dir, flag: 'o', description: "Directory to drop the namespace file into.", default: File.expand_path(File.dirname(__FILE__))
Settings.define :verbose, flag: 'v', description: "Get chatty", type: :boolean, default: false
Settings.define :silent, description: "Say nothing", type: :boolean, default: false
Settings.define :head_length, flag: 'h', description: "The number of lines to read into the wiki xml for the namespace definitions.", type: Integer, default: 100
Settings.define :std_out, flag: 's', description: "Print output to standard out.", type: :boolean, default: false
Settings.define :to_tsv, flag: 't', description: 'Format the output as a TSV instead of JSON', type: :boolean, default: false

Settings.resolve!

Settings.out_dir = File.expand_path(Settings.out_dir)

namespaces         = {}  # merged mapping: namespace name => numeric id
namespaces_by_wiki = {}  # per-wiki mapping, kept for debugging/inspection

# NOTE(review): Kernel#open on a URL relies on open-uri's patch; on Ruby 3+
# this must become URI.open -- confirm the target Ruby version.
wikis_page = open('http://dumps.wikimedia.org/backup-index.html')
wikis = Set.new

# grab the list of wikis (rows look like ">enwiki<"; capture the language prefix)
wikis_page.each_line do |line|
  next unless line =~ />[a-z]*wiki</
  wikis << line.gsub(/.*>([a-z]*)wiki<.*/, '\1')[0..-2]
end

if Settings.verbose
  $stderr.puts "Retrieved the names of #{wikis.size} wikis"
  $stderr.puts "Grabbing namespace data"
elsif (not Settings.silent)
  progressbar = ProgressBar.create(:title => "Retrieving Namespaces...", :total => wikis.size, :format => '%t |%B| %c/%C %e ')
end

wikis.each_with_index do |prefix, index|
  progressbar.increment unless (Settings.silent or Settings.verbose)
  namespaces_by_wiki[prefix] = {}
  $stderr.puts "Getting namespaces for #{prefix}.wikipedia.org" if Settings.verbose
  raw = `curl -s 'http://dumps.wikimedia.org/#{prefix}wiki/latest/#{prefix}wiki-latest-pages-logging.xml.gz' | gzcat | head -n #{Settings.head_length}`
  # FIX: $?.exitstatus after a pipeline reflects only the *last* command
  # (head), so a curl failure was never detected; test the output instead --
  # a failed or in-progress dump yields no lines.
  if raw.empty?
    out = "Could not access page dump for #{prefix}wiki." +
          " This dump is probably being updated now." +
          " Namespaces for this wiki will not be included in the final output"
    $stderr.puts out
    next
  end
  # pull <namespace key="N">Name</namespace> entries out of the XML head
  raw.each_line do |line|
    next unless line =~ /.*<\/?namespace[^>]*>/
    match = /<\/?namespace key="(?<key>-?\d+)"[^>]*>(?<ns>[^<]*)<\/namespace>/.match(line)
    next if match.nil?
    namespaces[match[:ns]] = match[:key].to_i
    namespaces_by_wiki[prefix][match[:ns]] = match[:key].to_i
    $stderr.puts "  #{match[:ns]} -> #{match[:key]}" if Settings.verbose
  end
  $stderr.puts "Finished getting namespaces for #{prefix}.wikipedia.org. #{wikis.size - index} wikis to go" if Settings.verbose
end

if Settings.to_tsv
  output = ""
  namespaces.each_pair do |k, v|
    output += "#{k}\t#{v}\n"
  end
else
  output = namespaces.to_json
end

if Settings.std_out
  # FIX: was `pp output`, which prints the string's inspect form (quoted,
  # with non-ASCII escaped); puts emits the payload verbatim.
  puts output
else
  filename = "#{Settings.out_dir}/#{NS_FILE}.#{Settings.to_tsv ? "tsv" : "json"}"
  File.open(filename, 'w') { |f| f.write(output) }
end
@@ -0,0 +1,68 @@
1
+ # encoding:UTF-8
2
+
3
+ require 'multi_json'
4
+
5
module MungingUtils
  extend self # you can call MungingUtils.foo, or include it and call on self.

  # All non-keyboard characters, i.e. anything outside the printable-ASCII
  # range 0x20..0x7e (the original comment said "0x127", a typo for 0x7e/127).
  NON_PLAIN_ASCII_RE = /[^\x20-\x7e]/m
  # ASCII control characters.
  # FIX: was /[\x00-\x19]/, which silently missed 0x1a-0x1f (SUB, ESC, FS,
  # GS, RS, US) even though the docs promise everything below 0x20.
  CONTROL_CHARS_RE = /[\x00-\x1f]/m

  # Returns [YYYYmmdd, HHMMSS, epoch seconds, day-of-week] for the given Time.
  def time_columns_from_time(time)
    columns = []
    columns << "%04d%02d%02d" % [time.year, time.month, time.day]
    columns << "%02d%02d%02d" % [time.hour, time.min, time.sec]
    columns << time.to_i
    columns << time.wday
    return columns
  end

  # Logs a one-line warning about a bad record (JSON-encoded, truncated to
  # ~1000 chars). Returns nil so callers can `return warn_record(...)`.
  # NOTE(review): relies on a `Log` object being defined by the host app.
  def warn_record(desc, record=nil)
    record_info = MultiJson.encode(record)[0..1000] rescue "(unencodeable record) #{record.inspect[0..100]}"
    Log.warn [desc, record_info].join("\t")
    nil
  end

  # Modifies the text in place, replacing all newlines, tabs, and other
  # control characters (those < ascii 0x20) with a space. This uses a whitelist.
  #
  # Only use this if funny characters aren't supposed to be in there in the
  # first place; there are safe, easy ways to properly encode, eg
  # `MultiJson.encode()`.
  def scrub_control_chars(text)
    text.gsub!(CONTROL_CHARS_RE, ' ')
    text
  end

  # Modifies the text in place, replacing all non-keyboard characters
  # (newline, tab, anything not between ascii 0x20 and 0x7e) with a
  # backslash-u escape ("\uXXXX").
  # NOTE(review): despite the name this emits JSON/C-style \u escapes, not
  # XML entities (&#xNN;) -- confirm which consumers expect.
  def safe_xml_encode(text)
    # FIX: was `unless jsonized.ascii_only?` -- `jsonized` is undefined here
    # (copy-paste from safe_json_encode), raising NameError for any input.
    text.gsub!(NON_PLAIN_ASCII_RE){|ch| "\\u%04x" % ch.ord } unless text.ascii_only?
    text
  end


  # Returns a JSON encoded string, with all non-ASCII characters escaped
  def safe_json_encode(string)
    jsonized = MultiJson.encode(string)
    jsonized.gsub!(NON_PLAIN_ASCII_RE){|ch| "\\u%04x" % ch.ord } unless jsonized.ascii_only?
    jsonized
  end


end
57
+
58
Time.class_eval do
  # Timestamp in flat "YYYYmmddHHMMSSZ" (UTC) form.
  # FIX: use the non-destructive #getutc -- Time#utc converts the receiver
  # to UTC *in place*, so the original implementation silently changed the
  # zone of every Time object it was called on.
  def to_flat
    getutc.strftime("%Y%m%d%H%M%SZ")
  end
end

MatchData.class_eval do
  # Named captures as a {symbol_name => captured_string} hash.
  def as_hash
    Hash[ names.map{|name| [name.to_sym, self[name]] } ]
  end
end