wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Extracts wikipedia articles from bzipped xml, outputs them in TSV. Article
4
+ # text is XML encoded, but all newlines and tabs (in fact, all control
5
+ # characters) are converted to XML entities, making it safe to truck around as
6
+ # TSV.
7
+ #
8
+ # Sample Pig LOAD Statement:
9
+ #
10
+ # all_articles = LOAD '$articles' AS
11
+ # (id:long, namespace:int, title:chararray, revision_id:long, revision_timestamp:long, redirect:chararray, text:chararray);
12
+ #
13
+
14
+ # ## Usage
15
+ #
16
+ # Flattens the wikipedia 'enwiki-latest-pages-articles.xml.gz' into a
17
+ # one-line-per-record heap.
18
+ #
19
+ # examples/munging/wikipedia/articles/extract_articles-parsed.rb --rm --run \
20
+ # /data/origin/dumps.wikimedia.org/enwiki/20120601/enwiki-20120601-pages-articles.xml \
21
+ # /data/results/wikipedia/full/articles.json.tsv
22
+ #
23
+
24
+ require 'wukong'
25
+ require 'wukong/streamer/encoding_cleaner'
26
+ require 'crack/xml'
27
+ require 'multi_json'
28
+ require 'oj'
29
+ require_relative '../utils/munging_utils.rb'
30
+
31
+ module ArticlesExtractor
32
+ class Mapper < Wukong::Streamer::LineStreamer
33
+ include Wukong::Streamer::EncodingCleaner
34
+ include MungingUtils
35
+ # Buffer of input lines accumulated since the last flush; created on first use.
36
+ def lines
37
+ @lines ||= []
38
+ end
39
+ # Appends line to the buffer. When the line contains </page>, parses the newline-joined buffer with Crack::XML, empties the buffer and returns [parsed]; otherwise returns nil.
40
+ def recordize line
41
+ lines << line
42
+ if line =~ /<\/page>/
43
+ result = Crack::XML::parse(lines.join("\n"))
44
+ @lines = []
45
+ return [result]
46
+ else
47
+ return nil
48
+ end
49
+ end
50
+ # Yields one row per parsed page: [id, ns, title, revision id, timestamp, redirect title, text passed through safe_json_encode]. A top-level 'mediawiki' wrapper is unwrapped first.
51
+ def process record
52
+ if record.has_key? 'mediawiki'
53
+ record = record['mediawiki']
54
+ end
55
+ # redirect is the title of the page's 'redirect' entry when present, '' otherwise.
56
+ redirect = record['page']['redirect'] ? record['page']['redirect']['title'] : ''
57
+ timestamp = Time.iso8601(record['page']['revision']['timestamp']).to_flat
58
+ raw_text = record['page']['revision']['text']
59
+
60
+ # some few parts per million articles have an empty body -- workaround
61
+ raw_text = '' if not raw_text.is_a?(String)
62
+
63
+ result = [
64
+ record['page']['id'],
65
+ record['page']['ns'],
66
+ scrub_control_chars(record['page']['title']),
67
+ record['page']['revision']['id'],
68
+ timestamp,
69
+ scrub_control_chars(redirect),
70
+ safe_json_encode(raw_text)
71
+ ]
72
+ yield result
73
+ end
74
+ end
75
+ end
76
+
77
+ # Force it to run in a single map task (min_split_size below is 2**60), to avoid writing a custom input format.
78
+ # The job runs in 2 hours, once; much less than the time it'd take me to do so.
79
+ Wukong::Script.new(ArticlesExtractor::Mapper, nil, min_split_size: 1152921504606846976).run
@@ -0,0 +1,136 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Extracts wikipedia articles from bzipped xml, outputs them in TSV. Article
4
+ # text is XML encoded, but all newlines and tabs (in fact, all control
5
+ # characters) are converted to XML entities, making it safe to truck around as
6
+ # TSV.
7
+
8
+ # ## Schema
9
+ #
10
+ # Sample Pig LOAD Statement:
11
+ #
12
+ # all_articles = LOAD '$articles' AS
13
+ # (id:long, namespace:int, title:chararray, revision_id:long, revision_timestamp:long, redirect:chararray, text:chararray);
14
+ #
15
+
16
+ # ## Usage
17
+ #
18
+ # Flattens the wikipedia 'enwiki-latest-pages-articles.xml.gz' into a
19
+ # one-line-per-record heap.
20
+ #
21
+ # examples/munging/wikipedia/articles/extract_articles-templated.rb --rm --run \
22
+ # /data/origin/dumps.wikimedia.org/enwiki/20120601/enwiki-20120601-pages-articles.xml \
23
+ # /data/results/wikipedia/full/articles.tsv
24
+ #
25
+
26
+ require 'wukong'
27
+ require 'wukong/streamer/encoding_cleaner'
28
+ require 'crack/xml'
29
+ require 'multi_json'
30
+ require_relative '../utils/munging_utils.rb'
31
+
32
+ # <page>
33
+ # <title>Anarchism</title>
34
+ # <ns>0</ns>
35
+ # <id>12</id>
36
+ # <revision>
37
+ # <id>370845941</id>
38
+ # <timestamp>2010-06-29T20:14:56Z</timestamp>
39
+ # <contributor>
40
+ # <username>Centographer</username>
41
+ # <id>12640258</id>
42
+ # </contributor>
43
+ # <minor />
44
+ # <comment>clarifying not ordinary anarcho-socialism</comment>
45
+ # <text xml:space="preserve">
46
+ # ...snip ...
47
+ # </text>
48
+ # <sha1>...</sha1>
49
+ # </revision>
50
+ # </page>
51
+ #
52
+ module ArticlesExtractor
53
+ class Mapper < Wukong::Streamer::LineStreamer
54
+ include Wukong::Streamer::EncodingCleaner
55
+ include MungingUtils
56
+
57
+ def initialize(*)
58
+ super
59
+ @lines = []
60
+ @state = :out_of_article
61
+ @num_lines = 0
62
+ end
63
+
64
+ # Bolt together all lines between a <page> and a </page> marker.
65
+ def recordize line
66
+ @num_lines += 1
67
+ return if @state == :out_of_article && (ARTICLE_BEG_RE !~ line)
68
+ @state = :in_article
69
+ #
70
+ @lines << line
71
+ if ARTICLE_END_RE =~ line
72
+ result = @lines.join("\n")
73
+ @lines = []
74
+ @state = :out_of_article
75
+ return [result]
76
+ else
77
+ return nil
78
+ end
79
+ end
80
+
81
+ def process article
82
+ info = ARTICLE_RE.match(article)
83
+ if not info then warn "Bad match line #{@lines}: #{article.to_s[0..2000]}" ; return ; end
84
+
85
+ timestamp = [info[:rts_yr], info[:rts_mo], info[:rts_day], info[:rts_hr], info[:rts_min], info[:rts_sec], 'Z'].join
86
+ text = Crack::XML::parse("<text>#{info[:text]}</text>")['text'] || ''
87
+ redirect = info[:redirect] || ''
88
+
89
+ record = [
90
+ info[:id],
91
+ info[:ns],
92
+ scrub_control_chars(info[:title]),
93
+ info[:revision_id],
94
+ timestamp,
95
+ scrub_control_chars(redirect),
96
+ safe_json_encode(text)
97
+ ]
98
+ yield record
99
+ end
100
+ end
101
+
102
+ ARTICLE_BEG_RE = %r{\A\s*<page>\z}
103
+ ARTICLE_END_RE = %r{\A\s*</page>\z}
104
+ ARTICLE_RE = %r{\A
105
+ \s*<page>
106
+ \s* <title>(?<title>[^<]*)</title>
107
+ \s* <ns>(?<ns>\d+)</ns>
108
+ \s* <id>(?<id>\d+)</id>
109
+ \s* (?:<redirect\stitle=\"(?<redirect>[^\"]+)\"\s/>)?
110
+ \s* (?:<restrictions>(?<restrictions>[^<]+)</restrictions>)?
111
+ \s* <revision>
112
+ \s* <id>(?<revision_id>\d+)</id>
113
+ \s* <timestamp>(?<rts_yr>\d\d\d\d)-(?<rts_mo>\d\d)-(?<rts_day>\d\d)T(?<rts_hr>\d\d):(?<rts_min>\d\d):(?<rts_sec>\d\d)Z</timestamp>
114
+ \s* (?:
115
+ <contributor>\s*<username>[^<]+</username>\s*<id>\d+</id>\s*</contributor> |
116
+ <contributor>\s*<ip>[\d\.]+</ip>\s*</contributor> |
117
+ <contributor\sdeleted="deleted"\s/>
118
+ )
119
+ \s* (?:<minor\s/>)?
120
+ \s* (?:<comment>[^<]*</comment>|<comment\sdeleted="deleted"\s/>)?
121
+ \s* (?:
122
+ <text\sxml:space="preserve">
123
+ (?<text>.*)
124
+ </text>
125
+ |
126
+ <text\sxml:space="preserve"\s/>
127
+ )
128
+ \s* (?:<sha1>(?<sha1>[a-z0-9]+)</sha1> | <sha1\s/>)
129
+ \s* </revision>
130
+ \s*</page>\s*\z}xmo
131
+
132
+ end
133
+
134
+ # Force it to run in a single map task (min_split_size below is 2**60), to avoid writing a custom input format.
135
+ # The job runs in 2 hours, once; much less than the time it'd take me to do so.
136
+ Wukong::Script.new(ArticlesExtractor::Mapper, nil, min_split_size: 1152921504606846976).run
@@ -0,0 +1,54 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Generate plain-text versions of articles from the tsv-converted raw article data
4
+ # (output from extract_articles)
5
+ #
6
+ # This strips out template tags, wiki links, and so forth
7
+ #
8
+ # Everything that's left is either actual text, or nicely detached punctuation.
9
+
10
+ # ## Usage
11
+ #
12
+ # Uses the output of extract_articles-templated.rb:
13
+ #
14
+ # examples/munging/wikipedia/articles/textualize_articles.rb --rm --run \
15
+ # /data/results/wikipedia/full/articles.json.tsv \
16
+ # /data/results/wikipedia/full/article_texts.json.tsv
17
+ #
18
+
19
+ require 'wukong'
20
+ require 'multi_json'
21
+ require 'oj'
22
+ require 'strscan'
23
+ require 'find'
24
+ require 'sanitize'
25
+ #
26
+ require_relative '../utils/munging_utils.rb'
27
+ require_relative './wp2txt_article'
28
+ require_relative './wp2txt_utils'
29
+
30
+ module TextualizeArticles
31
+
32
+ class Mapper < Wukong::Streamer::RecordStreamer
33
+ include MungingUtils
34
+
35
+ @@errors = 0 # failed records so far; a class variable, so the count is shared by all Mapper instances in the process
36
+ MAX_ERRORS = 1_000 # the job aborts once more than this many records have failed
37
+
38
+ def process(id, namespace, title, revision_id, timestamp, redirect, raw_text)
39
+ # Decode the JSON-encoded wikitext, render it via Wp2txt::Article#polish, and re-emit the record with only the text column replaced.
40
+ text = MultiJson.decode(raw_text)
41
+ article = Wp2txt::Article.new(text, title)
42
+ jsonized_text = MultiJson.encode(article.polish)
43
+
44
+ yield [id, namespace, title, revision_id, timestamp, redirect, jsonized_text]
45
+
46
+ rescue StandardError => err # method-level rescue: a failing record is reported, not re-raised
47
+ Wukong.bad_record("Bad Record", err, raw_text)
48
+ raise "Too many errors" if (@@errors += 1) > MAX_ERRORS # counts this failure, then gives up past the limit
49
+ end
50
+
51
+ end
52
+ end
53
+
54
+ Wukong::Script.new(TextualizeArticles::Mapper, nil).run # NOTE(review): the nil second argument presumably means "no reducer"; confirm against Wukong::Script
@@ -0,0 +1,43 @@
1
+
2
+ # cat /mnt/data/origin/dumps.wikimedia.org/enwiki/20120601/enwiki-20120601-pages-articles.xml | ruby -ne '$_.chomp!; case $_ when %r{\A(\s*)<redirect title="[^"]+" />\z} then puts %Q{#{$1}<redirect title=\"...\" />} when %r{\A(\s*<[^>]+>)[^<]*(</\w+>)\z} then puts "#{$1}...#{$2}" ; when %r{\A(\s*<[^>]+>)} then puts $1 ; when %r{\A[^<]*(</[\w-]+>)\z} then puts $1 else false end ' > /mnt/data/origin/dumps.wikimedia.org/enwiki/20120601/xml-tags-seen.txt &
3
+
4
+ # cat /mnt/data/origin/dumps.wikimedia.org/enwiki/20120601/xml-tags-seen.txt | sort -S1G --temp=/mnt{,2,3,4}/tmp | uniq -c | sort -n | tee xml-tags-census.txt
5
+
6
+ # cat /mnt/data/origin/dumps.wikimedia.org/enwiki/20120601/xml-tags-seen.txt | sort -S1800M --temp=/mnt{2,3,4}/tmp | uniq -c | sort -n | tee xml-tags-census.txt &
7
+
8
+
9
+ # 609844 <page>
10
+ # 609844 <title>...</title>
11
+ # 609844 <ns>...</ns>
12
+ # 609844 <id>...</id>
13
+ # 714 <restrictions>...</restrictions>
14
+ #
15
+ # 251254 <redirect title="..." />
16
+ # 609844 <revision>
17
+ # 609844 <id>...</id>
18
+ # 609844 <timestamp>...</timestamp>
19
+ #
20
+ # 609843 <contributor>
21
+ # 548236 <username>...</username>
22
+ # 61607 <ip>...</ip>
23
+ # 548236 <id>...</id>
24
+ # 609843 </contributor>
25
+ #
26
+ # 288319 <minor />
27
+ # 529222 <comment>...</comment>
28
+ # 108 <comment>
29
+ # 108 </comment>
30
+ #
31
+ # 224818 <text xml:space="preserve">...</text>
32
+ # 385019 <text xml:space="preserve">
33
+ # 7 <text xml:space="preserve" />
34
+ #
35
+ # 346490 <sha1>...</sha1>
36
+ # 263354 <sha1 />
37
+ #
38
+ # 609843 </revision>
39
+ # 609843 </page>
40
+ #
41
+ # 384998 </text>
42
+ # 20 </text>
43
+ # 643676 </page>
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Yoichiro Hasebe
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,259 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Taken from Yoichiro Yohasebe's [`wp2txt` project](https://github.com/yohasebe/wp2txt)
5
+ # with liberal modifications for our purposes.
6
+ #
7
+ # This software is distributed under the MIT License. Please see the `./wp2txt-LICENSE.txt` file.
8
+
9
+ require 'strscan'
10
+ require_relative 'wp2txt_utils'
11
+
12
+ module Wp2txt
13
+
14
+ # possible element type, which could be later chosen to print or not to print
15
+ # :mw_heading
16
+ # :mw_htable
17
+ # :mw_quote
18
+ # :mw_unordered
19
+ # :mw_ordered
20
+ # :mw_definition
21
+ # :mw_pre
22
+ # :mw_paragraph
23
+ # :mw_comment
24
+ # :mw_math
25
+ # :mw_source
26
+ # :mw_inputbox
27
+ # :mw_template
28
+ # :mw_link
29
+ # :mw_summary
30
+ # :mw_blank
31
+ # :mw_redirect
32
+
33
+ # an article contains elements, each of which is [TYPE, string]
34
+ class Article
35
+
36
+ include Wp2txt
37
+ attr_accessor :elements, :title
38
+
39
+ # class varialbes to save resource for generating regexps
40
+ # those with a trailing number 1 represent opening tag/markup
41
+ # those with a trailing number 2 represent closing tag/markup
42
+ # those without a trailing number contain both opening/closing tags/markups
43
+
44
+ @@in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$') # a line holding nothing but one {{...}} template
45
+ @@in_link_regex = Regexp.new('^\s*\[.*\]\s*$') # a line that starts with [ and ends with ]
46
+
47
+ @@in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
48
+ @@in_inputbox_regex1 = Regexp.new('<inputbox>')
49
+ @@in_inputbox_regex2 = Regexp.new('<\/inputbox>')
50
+
51
+ @@in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
52
+ @@in_source_regex1 = Regexp.new('<source.*?>')
53
+ @@in_source_regex2 = Regexp.new('<\/source>')
54
+
55
+ @@in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
56
+ @@in_math_regex1 = Regexp.new('<math.*?>')
57
+ @@in_math_regex2 = Regexp.new('<\/math>')
58
+
59
+ @@in_heading_regex = Regexp.new('^=+.*?=+$') # == Heading == style line
60
+
61
+ @@in_html_table_regex = Regexp.new('<table.*?>.*?<\/\s*table>') # a whole table on one line, content included, so such a line does not open multi-line table mode
62
+ @@in_html_table_regex1 = Regexp.new('<table\b') # opening tag only: starts a multi-line HTML table
63
+ @@in_html_table_regex2 = Regexp.new('<\/\s*table>') # closing tag: ends a multi-line HTML table
64
+
65
+ @@in_table_regex1 = Regexp.new('^\s*\{\|') # {| opens a wiki table
66
+ @@in_table_regex2 = Regexp.new('^\|\}.*?$') # |} closes it
67
+
68
+ @@in_unordered_regex = Regexp.new('^\*')
69
+ @@in_ordered_regex = Regexp.new('^\#')
70
+ @@in_pre_regex = Regexp.new('^ ') # a leading space marks a preformatted line
71
+ @@in_definition_regex = Regexp.new('^[\;\:]')
72
+
73
+ @@blank_line_regex = Regexp.new('^\s*$')
74
+
75
+ @@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
76
+
77
+ def initialize(text, title = "", strip_tmarker = false) # parses `text` right away; the result is stored in @elements
78
+ @title = title.strip
79
+ @strip_tmarker = strip_tmarker # when true, parse removes the leading list/definition markers from those lines
80
+ parse text
81
+ end
82
+
83
+ def create_element(tp, text) # an element is a two-item array: [type symbol, string]
84
+ [tp, text]
85
+ end
86
+
87
+ def parse(source)
88
+ self.class.remove_comments(source)
89
+ @elements = []
90
+ mode = nil
91
+ open_stack = []
92
+ close_stack = []
93
+ source.each_line do |line|
94
+
95
+ case mode
96
+ when :mw_table
97
+ if @@in_table_regex2 =~ line
98
+ mode = nil
99
+ end
100
+ @elements.last.last << line
101
+ next
102
+ when :mw_inputbox
103
+ if @@in_inputbox_regex2 =~ line
104
+ mode = nil
105
+ end
106
+ @elements.last.last << line
107
+ next
108
+ when :mw_source
109
+ if @@in_source_regex2 =~ line
110
+ mode = nil
111
+ end
112
+ @elements.last.last << line
113
+ next
114
+ when :mw_math
115
+ if @@in_math_regex2 =~ line
116
+ mode = nil
117
+ end
118
+ @elements.last.last << line
119
+ next
120
+ when :mw_htable
121
+ if @@in_html_table_regex2 =~ line
122
+ mode = nil
123
+ end
124
+ @elements.last.last << line
125
+ next
126
+ end
127
+
128
+ case line
129
+ when @@blank_line_regex
130
+ @elements << create_element(:mw_blank, "\n")
131
+ when @@redirect_regex
132
+ @elements << create_element(:mw_redirect, line)
133
+ when @@in_template_regex
134
+ @elements << create_element(:mw_template, line)
135
+ when @@in_heading_regex
136
+ @elements << create_element(:mw_heading, "\n" + line + "\n")
137
+ when @@in_inputbox_regex
138
+ @elements << create_element(:mw_inputbox, line)
139
+ when @@in_inputbox_regex1
140
+ mode = :mw_inputbox
141
+ @elements << create_element(:mw_inputbox, line)
142
+ when @@in_source_regex
143
+ @elements << create_element(:mw_source, line)
144
+ when @@in_source_regex1
145
+ mode = :mw_source
146
+ @elements << create_element(:mw_source, line)
147
+ when @@in_math_regex
148
+ @elements << create_element(:mw_math, line)
149
+ when @@in_math_regex1
150
+ mode = :mw_math
151
+ @elements << create_element(:mw_math, line)
152
+ when @@in_html_table_regex
153
+ @elements << create_element(:mw_htable, line)
154
+ when @@in_html_table_regex1
155
+ mode = :mw_htable
156
+ @elements << create_element(:mw_htable, line)
157
+ when @@in_table_regex1
158
+ mode = :mw_table
159
+ @elements << create_element(:mw_table, line)
160
+ when @@in_unordered_regex
161
+ line = line.sub(/\A[\*\#\;\:\ ]+/, "") if @strip_tmarker
162
+ @elements << create_element(:mw_unordered, line)
163
+ when @@in_ordered_regex
164
+ line = line.sub(/\A[\*\#\;\:\ ]+/, "") if @strip_tmarker
165
+ @elements << create_element(:mw_ordered, line)
166
+ when @@in_pre_regex
167
+ line = line.sub(/\A\^\ /, "") if @strip_tmarker
168
+ @elements << create_element(:mw_pre, line)
169
+ when @@in_definition_regex
170
+ line = line.sub(/\A[\;\:\ ]+/, "") if @strip_tmarker
171
+ @elements << create_element(:mw_definition, line)
172
+ when @@in_link_regex
173
+ @elements << create_element(:mw_link, line)
174
+ else
175
+ @elements << create_element(:mw_paragraph, line)
176
+ end
177
+ end
178
+ @elements
179
+ end
180
+
181
+ def self.remove_comments(text)
182
+ # Remove every <!-- ... --> comment from `text`, modifying it in place.
183
+ # Each comment is replaced by as many newline characters as it
184
+ # contained. Returns nil when `text` has no comments (gsub! semantics).
185
+ text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
186
+ num_of_newlines = content.count("\n")
187
+ (num_of_newlines == 0) ? "" : ("\n" * num_of_newlines)
188
+ end
189
+ end
190
+
191
+ EXCLUDE_SECTIONS = { # element type => true when polish should leave that type out of the output
192
+ mw_title: false,
193
+ mw_heading: false,
194
+ mw_paragraph: false,
195
+ mw_link: false,
196
+ mw_redirect: false,
197
+ # line-oriented text blocks: kept
198
+ mw_pre: false,
199
+ mw_quote: false,
200
+ mw_unordered: false,
201
+ mw_ordered: false,
202
+ mw_definition: false,
203
+ # tables, math, source listings, blanks and templates: dropped
204
+ mw_table: true,
205
+ mw_htable: true,
206
+ mw_blank: true,
207
+ mw_math: true,
208
+ mw_source: true,
209
+ mw_template: true,
210
+ } # NOTE(review): parse also produces :mw_inputbox, which has no entry here, so it is kept and reaches the "Unknown section" warning in polish
211
+
212
+ def polish # renders the parsed elements as plain text; returns '' when only whitespace is left
213
+ contents = []
214
+ elements.each do |el_type, element|
215
+ contents << "+#{el_type.to_s.upcase}+\t" if $DEBUG_MODE
216
+ next if EXCLUDE_SECTIONS[el_type] # the debug marker above is added even for excluded types
217
+ # Types currently marked true in EXCLUDE_SECTIONS never reach the case below; their branches only matter if that table changes.
218
+ case el_type
219
+ when :mw_heading then contents << format_wiki(element)
220
+ when :mw_paragraph then contents << format_wiki(element)
221
+ when :mw_link then contents << format_wiki(element)
222
+ when :mw_redirect then contents << format_wiki(element) << "\n\n"
223
+ when :mw_pre then contents << element
224
+ when :mw_quote then contents << format_wiki(element)
225
+ when :mw_unordered then contents << format_wiki(element)
226
+ when :mw_ordered then contents << format_wiki(element)
227
+ when :mw_definition then contents << format_wiki(element)
228
+ when :mw_table, :mw_htable then contents << format_wiki(element)
229
+ when :mw_math, :mw_source then contents << format_wiki(element)
230
+ when :mw_blank then contents << format_wiki(element)
231
+ else
232
+ warn "Unknown section #{el_type}, content '#{element.to_s.gsub(/[\r\n]+/m,'')[0..200]}'"
233
+ contents << format_wiki(element)
234
+ end
235
+ end
236
+ text = contents.join
237
+
238
+ # Extract text from <b>..</b> and so forth; remove contents of <ref>...</ref> completely
239
+ text = clean_html(text)
240
+ # translate some recognizable special characters
241
+ text = special_chr(text)
242
+ # re-hang the no-wiki segments
243
+ unescape_nowiki(text) # NOTE(review): the return value is discarded, so this only has an effect if unescape_nowiki mutates text in place; confirm in wp2txt_utils
244
+ # strip out templates. Several parts per million of these will fail for
245
+ # bad structure; I assume that means some parts per thousand will be
246
+ # mis-estimated. C'est la UGC.
247
+ text = remove_templates(text) if EXCLUDE_SECTIONS[:mw_template]
248
+
249
+ return '' if /\A\s*\z/m =~ text
250
+ # Prepend the title as a "# ..." header unless titles are excluded, collapse runs of three or more newlines to two, and end with a newline.
251
+ result = EXCLUDE_SECTIONS[:mw_title] ? "" : "# #{format_wiki(title)}\n\n"
252
+ result << text
253
+ result.gsub!(/\n\n\n+/m){"\n\n"}
254
+ result << "\n"
255
+
256
+ result
257
+ end
258
+ end
259
+ end