ul-wukong 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +60 -0
  3. data/.gitmodules +6 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +19 -0
  6. data/.yardopts +6 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +17 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE.md +95 -0
  11. data/NOTES-travis.md +31 -0
  12. data/README-old.md +422 -0
  13. data/README.md +1308 -0
  14. data/Rakefile +28 -0
  15. data/TODO.md +99 -0
  16. data/bin/cutc +30 -0
  17. data/bin/cuttab +5 -0
  18. data/bin/greptrue +6 -0
  19. data/bin/md5sort +20 -0
  20. data/bin/setcat +11 -0
  21. data/bin/tabchar +5 -0
  22. data/bin/uniq-ord +59 -0
  23. data/bin/uniqc +3 -0
  24. data/bin/wu +34 -0
  25. data/bin/wu-clean-encoding +31 -0
  26. data/bin/wu-date +13 -0
  27. data/bin/wu-datetime +13 -0
  28. data/bin/wu-hist +3 -0
  29. data/bin/wu-lign +186 -0
  30. data/bin/wu-local +4 -0
  31. data/bin/wu-plus +9 -0
  32. data/bin/wu-source +5 -0
  33. data/bin/wu-sum +31 -0
  34. data/diagrams/wu_local.dot +39 -0
  35. data/diagrams/wu_local.dot.png +0 -0
  36. data/examples/Gemfile +38 -0
  37. data/examples/README.md +9 -0
  38. data/examples/basic/string_reverser.rb +23 -0
  39. data/examples/basic/tiny_count.rb +8 -0
  40. data/examples/basic/word_count/accumulator.rb +26 -0
  41. data/examples/basic/word_count/tokenizer.rb +13 -0
  42. data/examples/basic/word_count/word_count.rb +6 -0
  43. data/examples/dataflow/scraper_macro_flow.rb +28 -0
  44. data/examples/deploy_pack/Gemfile +6 -0
  45. data/examples/deploy_pack/README.md +6 -0
  46. data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
  47. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  48. data/examples/deploy_pack/config/environment.rb +1 -0
  49. data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
  50. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  51. data/examples/dsl/dataflow/simple.rb +12 -0
  52. data/examples/dsl/dataflow/telegram.rb +45 -0
  53. data/examples/dsl/workflow/cherry_pie.dot +97 -0
  54. data/examples/dsl/workflow/cherry_pie.md +104 -0
  55. data/examples/dsl/workflow/cherry_pie.png +0 -0
  56. data/examples/dsl/workflow/cherry_pie.rb +101 -0
  57. data/examples/empty/.gitkeep +0 -0
  58. data/examples/examples_helper.rb +9 -0
  59. data/examples/geo.rb +4 -0
  60. data/examples/geo/geo_grids.numbers +0 -0
  61. data/examples/geo/geolocated.rb +331 -0
  62. data/examples/geo/quadtile.rb +69 -0
  63. data/examples/geo/spec/geolocated_spec.rb +247 -0
  64. data/examples/geo/tile_fetcher.rb +77 -0
  65. data/examples/graph/implied_geolocation/README.md +63 -0
  66. data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
  67. data/examples/improver/tweet_summary.rb +73 -0
  68. data/examples/loadable.rb +2 -0
  69. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  70. data/examples/munging/airline_flights/airplane.rb +0 -0
  71. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  72. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  73. data/examples/munging/airline_flights/indexable.rb +75 -0
  74. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  75. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  76. data/examples/munging/airline_flights/tasks.rake +83 -0
  77. data/examples/munging/airline_flights/topcities.rb +167 -0
  78. data/examples/munging/geo/geo_json.rb +54 -0
  79. data/examples/munging/geo/geo_models.rb +69 -0
  80. data/examples/munging/geo/geonames_models.rb +107 -0
  81. data/examples/munging/geo/iso_codes.rb +172 -0
  82. data/examples/munging/geo/reconcile_countries.rb +124 -0
  83. data/examples/munging/geo/tasks.rake +71 -0
  84. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  85. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  86. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  87. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  88. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  89. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  90. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  91. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
  92. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  93. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  94. data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
  95. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  96. data/examples/rake_helper.rb +97 -0
  97. data/examples/ruby_project/Gemfile +6 -0
  98. data/examples/ruby_project/README.md +6 -0
  99. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  100. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  101. data/examples/server_logs/logline.rb +95 -0
  102. data/examples/server_logs/models.rb +66 -0
  103. data/examples/server_logs/page_counts.pig +48 -0
  104. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  105. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  106. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  107. data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
  108. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  109. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  110. data/examples/serverlogs/models/logline.rb +102 -0
  111. data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
  112. data/examples/serverlogs/visit_paths/common.rb +4 -0
  113. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  114. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  115. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  116. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  117. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  118. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  119. data/examples/splitter.rb +94 -0
  120. data/examples/string_reverser.rb +7 -0
  121. data/examples/text/pig_latin/pig_latinizer.rb +35 -0
  122. data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
  123. data/examples/text/regional_flavor/README.md +14 -0
  124. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  125. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  126. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  127. data/examples/twitter.rb +5 -0
  128. data/lib/hanuman.rb +36 -0
  129. data/lib/hanuman/graph.rb +97 -0
  130. data/lib/hanuman/graphvizzer.rb +206 -0
  131. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  132. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  133. data/lib/hanuman/link.rb +35 -0
  134. data/lib/hanuman/registry.rb +46 -0
  135. data/lib/hanuman/stage.rb +128 -0
  136. data/lib/hanuman/tree.rb +67 -0
  137. data/lib/wu/geo.rb +4 -0
  138. data/lib/wu/geo/geo_grids.numbers +0 -0
  139. data/lib/wu/geo/geolocated.rb +331 -0
  140. data/lib/wu/geo/quadtile.rb +69 -0
  141. data/lib/wu/graph/union_find.rb +62 -0
  142. data/lib/wu/model/reconcilable.rb +63 -0
  143. data/lib/wu/munging.rb +71 -0
  144. data/lib/wu/social/models/twitter.rb +31 -0
  145. data/lib/wu/wikipedia/models.rb +20 -0
  146. data/lib/wukong.rb +54 -0
  147. data/lib/wukong/dataflow.rb +43 -0
  148. data/lib/wukong/doc_helpers.rb +14 -0
  149. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  150. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  151. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  152. data/lib/wukong/driver.rb +214 -0
  153. data/lib/wukong/driver/event_machine_driver.rb +15 -0
  154. data/lib/wukong/driver/wiring.rb +68 -0
  155. data/lib/wukong/local.rb +42 -0
  156. data/lib/wukong/local/runner.rb +96 -0
  157. data/lib/wukong/local/stdio_driver.rb +104 -0
  158. data/lib/wukong/logger.rb +102 -0
  159. data/lib/wukong/model/faker.rb +136 -0
  160. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  161. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  162. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  163. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  164. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  165. data/lib/wukong/plugin.rb +48 -0
  166. data/lib/wukong/processor.rb +110 -0
  167. data/lib/wukong/rake_helper.rb +6 -0
  168. data/lib/wukong/runner.rb +169 -0
  169. data/lib/wukong/runner/boot_sequence.rb +123 -0
  170. data/lib/wukong/runner/code_loader.rb +52 -0
  171. data/lib/wukong/runner/command_runner.rb +44 -0
  172. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  173. data/lib/wukong/runner/help_message.rb +42 -0
  174. data/lib/wukong/source.rb +33 -0
  175. data/lib/wukong/source/source_driver.rb +74 -0
  176. data/lib/wukong/source/source_runner.rb +38 -0
  177. data/lib/wukong/spec_helpers.rb +74 -0
  178. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  179. data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
  180. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  181. data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
  182. data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
  183. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
  184. data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
  185. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
  186. data/lib/wukong/version.rb +3 -0
  187. data/lib/wukong/widget/echo.rb +55 -0
  188. data/lib/wukong/widget/extract.rb +122 -0
  189. data/lib/wukong/widget/filters.rb +452 -0
  190. data/lib/wukong/widget/logger.rb +56 -0
  191. data/lib/wukong/widget/operators.rb +82 -0
  192. data/lib/wukong/widget/reducers.rb +10 -0
  193. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  194. data/lib/wukong/widget/reducers/bin.rb +368 -0
  195. data/lib/wukong/widget/reducers/count.rb +73 -0
  196. data/lib/wukong/widget/reducers/group.rb +128 -0
  197. data/lib/wukong/widget/reducers/group_concat.rb +98 -0
  198. data/lib/wukong/widget/reducers/improver.rb +71 -0
  199. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  200. data/lib/wukong/widget/reducers/moments.rb +72 -0
  201. data/lib/wukong/widget/reducers/sort.rb +180 -0
  202. data/lib/wukong/widget/reducers/uniq.rb +91 -0
  203. data/lib/wukong/widget/serializers.rb +317 -0
  204. data/lib/wukong/widget/utils.rb +46 -0
  205. data/lib/wukong/widgets.rb +7 -0
  206. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  207. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  208. data/spec/examples/dataflow/parsing_spec.rb +14 -0
  209. data/spec/examples/dataflow/simple_spec.rb +34 -0
  210. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  211. data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
  212. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  213. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  214. data/spec/examples/text/pig_latin_spec.rb +18 -0
  215. data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
  216. data/spec/hanuman/graph_spec.rb +119 -0
  217. data/spec/hanuman/hanuman_spec.rb +10 -0
  218. data/spec/hanuman/registry_spec.rb +123 -0
  219. data/spec/hanuman/stage_spec.rb +81 -0
  220. data/spec/hanuman/tree_spec.rb +119 -0
  221. data/spec/spec.opts +1 -0
  222. data/spec/spec_helper.rb +43 -0
  223. data/spec/support/example_test_helpers.rb +95 -0
  224. data/spec/support/hanuman_test_helpers.rb +92 -0
  225. data/spec/support/integration_helper.rb +38 -0
  226. data/spec/support/model_test_helpers.rb +115 -0
  227. data/spec/support/shared_context_for_graphs.rb +57 -0
  228. data/spec/support/shared_context_for_reducers.rb +37 -0
  229. data/spec/support/shared_examples_for_builders.rb +94 -0
  230. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  231. data/spec/wu/model/reconcilable_spec.rb +152 -0
  232. data/spec/wukong/dataflow_spec.rb +87 -0
  233. data/spec/wukong/driver_spec.rb +154 -0
  234. data/spec/wukong/local/runner_spec.rb +29 -0
  235. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  236. data/spec/wukong/local_spec.rb +6 -0
  237. data/spec/wukong/logger_spec.rb +49 -0
  238. data/spec/wukong/model/faker_spec.rb +132 -0
  239. data/spec/wukong/processor_spec.rb +21 -0
  240. data/spec/wukong/runner_spec.rb +132 -0
  241. data/spec/wukong/source_spec.rb +6 -0
  242. data/spec/wukong/widget/extract_spec.rb +101 -0
  243. data/spec/wukong/widget/filters_spec.rb +79 -0
  244. data/spec/wukong/widget/logger_spec.rb +23 -0
  245. data/spec/wukong/widget/operators_spec.rb +25 -0
  246. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  247. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  248. data/spec/wukong/widget/reducers/group_spec.rb +21 -0
  249. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  250. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  251. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  252. data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
  253. data/spec/wukong/widget/serializers_spec.rb +114 -0
  254. data/spec/wukong/widget/sink_spec.rb +19 -0
  255. data/spec/wukong/widget/source_spec.rb +65 -0
  256. data/spec/wukong/wu-local_spec.rb +109 -0
  257. data/spec/wukong/wu-source_spec.rb +32 -0
  258. data/spec/wukong/wu_spec.rb +14 -0
  259. data/spec/wukong/wukong_spec.rb +10 -0
  260. data/wukong.gemspec +35 -0
  261. metadata +465 -0
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env ruby
2
+ require_relative './dbpedia_common'
3
+
4
+ # ## Usage
5
+ #
6
+ # Flattens the dbpedia 'geo_coordinates_en.nq' dump into a
7
+ # one-line-per-record heap of geolocated articles.
8
+ #
9
+ # examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb --rm --run \
10
+ # /data/origin/wikipedia/dbpedia/geo_coordinates_en.nq \
11
+ # /data/results/wikipedia/dbpedia-geo_coordinates_en.tsv
12
+ #
13
+
14
+ # ## Sample Pig Schema
15
+ #
16
+ # geolocations = LOAD '$geolocations' AS
17
+ # (id:long, namespace:int, title:chararray, longitude:float, latitude:float);
18
+ #
19
+
20
+ module Dbpedia
21
+ class GeocoordinatesExtractor < Wukong::Streamer::LineStreamer
22
+ include MungingUtils
23
+
24
+ KNOWN_LINE_RE = %r{\A(?:
25
+ \#\sstarted
26
+ | <http://dbpedia\.org/resource/[^>]+>\s
27
+ <http://(
28
+ www\.georss\.org/georss/point
29
+ | www\.w3\.org/1999/02/22-rdf-syntax-ns\#type
30
+ | www\.w3\.org/2003/01/geo/wgs84_pos\#(?:lat|long)
31
+ )>
32
+ )}x
33
+
34
+ # it's on one line in the actual dataset; split here for readability
35
+ GEO_RSS_RE = %r{\A
36
+ <http://dbpedia\.org/resource/(?<title>[^>]+)>
37
+ \s <http://www\.georss\.org/georss/point>
38
+ \s "(?<lat>#{DECIMAL_NUM_RE})\s(?<lng>#{DECIMAL_NUM_RE})"@en
39
+ \s <http://en\.wikipedia\.org/wiki/(?:[^\?]+)\?oldid=(?<article_id>\d+)>
40
+ \s \.
41
+ \z}x
42
+
43
+ def warn_record(desc, record=nil) # log a warning made of desc, a tab, and a dump of the offending record
44
+ record_info = MultiJson.encode(record)[0..1000] rescue "(unencodeable record) #{record.inspect[0..100]}" # JSON dump cut to its first 1001 characters; if building it raises, fall back to the first 101 characters of record.inspect
45
+ Log.warn [desc, record_info].join("\t") # emitted as one tab-separated warning line
46
+ end
47
+
48
+ ARTICLE_NAMESPACE = 0
49
+
50
+ # The file is in that godawful semantic web format. Let's fix that.
51
+ #
52
+ # <http://dbpedia.org/resource/Alabama> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.opengis.net/gml/_Feature> <http://en.wikipedia.org/wiki/Alabama?oldid=495507959> .
53
+ # <http://dbpedia.org/resource/Alabama> <http://www.w3.org/2003/01/geo/wgs84_pos#lat> "33.0"^^<http://www.w3.org/2001/XMLSchema#float> <http://en.wikipedia.org/wiki/Alabama?oldid=495507959> .
54
+ # <http://dbpedia.org/resource/Alabama> <http://www.w3.org/2003/01/geo/wgs84_pos#long> "-86.66666666666667"^^<http://www.w3.org/2001/XMLSchema#float> <http://en.wikipedia.org/wiki/Alabama?oldid=495507959> .
55
+ # <http://dbpedia.org/resource/Alabama> <http://www.georss.org/georss/point> "33.0 -86.66666666666667"@en <http://en.wikipedia.org/wiki/Alabama?oldid=495507959> .
56
+ #
57
+ # The lines seem to be redundant, with the georss one containing what we need, so just filter for those
58
+ #
59
+ def process(line) # yields one [article_id, namespace, title, lng, lat] array per georss point line; other known lines are skipped silently, unknown ones are logged
60
+ if not KNOWN_LINE_RE.match(line) then warn_record("Unrecognized line type", line) ; return ; end # lines outside the known shapes are logged and dropped
61
+ return unless $1 == 'www.georss.org/georss/point' # $1 is the predicate captured by the KNOWN_LINE_RE match above; only georss point lines carry on
62
+ geo_info = GEO_RSS_RE.match(line) # full parse of the line into named captures
63
+ if not geo_info then warn_record("Unrecognized georss line", line) ; return ; end # predicate was georss but the rest of the line did not fit
64
+
65
+ result = [
66
+ geo_info[:article_id], # NOTE(review): captured from the ?oldid= parameter of the wikipedia URL -- confirm this is the id wanted here
67
+ ARTICLE_NAMESPACE, # the dbpedia stuff is all NS 0
68
+ geo_info[:title],
69
+ geo_info[:lng], # longitude is emitted before latitude, although the georss literal is "lat lng"
70
+ geo_info[:lat],
71
+ ]
72
+ yield result
73
+ end
74
+ end
75
+
76
+ end
77
+
78
+ Wukong::Script.new(Dbpedia::GeocoordinatesExtractor, nil).run
@@ -0,0 +1,66 @@
1
+
2
+ Settings.define :dbpedia_filetype, description: 'The dbpedia file type ("geo_coordinates", etc) -- taken from input filename if available'
3
+
4
+ # Settings[:dbpedia_filetype] ||= Settings[:input_paths].to_s
5
+ # Settings[:dbpedia_filetype] = File.basename(Settings[:dbpedia_filetype]).gsub(/[\.\-].*/, '')
6
+ # @flavor, flavor_info = DBPEDIA_FLAVOR_INFO.detect{|flavor, (filename, _r)| filename == Settings[:dbpedia_filetype] }
7
+ # @kind, @filename, @regexps = flavor_info
8
+
9
+ DBPEDIA_FLAVOR_INFO = {
10
+ title: ['labels_en', [:title, ], ],
11
+ page_id: ['page_ids_en', [:page_id, ], ],
12
+ wikipedia_link: ['wikipedia_links_en', [:wikipedia_links, :wikipedia_backlink, :wikipedia_lang, ], ],
13
+ abstract_short: ['short_abstracts_en', [:abstract_short, ], ],
14
+ abstract_long: ['long_abstracts_en', [:abstract_long, ], ],
15
+ geo_coordinates: ['geo_coordinates_en', [:geo_coordinates, :geo_coord_skip_a, :geo_coord_skip_b, ], ],
16
+ # #
17
+ page_links: ['page_links_unredirected_en', [:page_links, ], ],
18
+ disambiguations: ['disambiguations_unredirected_en', [:disambiguations, ], ],
19
+ redirects: ['redirects_transitive_en', [:redirects, ], ],
20
+ # #
21
+ external_links: ['external_links_en', [:external_links, ], ],
22
+ homepages: ['homepages_en', [:homepages, ], ],
23
+ geonames: ['geonames_links', [:geonames, ], ],
24
+ musicbrainz: ['musicbrainz_links', [:musicbrainz, ], ],
25
+ nytimes: ['nytimes_links', [:nytimes, ], ],
26
+ uscensus: ['uscensus_links', [:uscensus, ], ],
27
+ pnd: ['pnd_en', [:pnd, ], ],
28
+ # #
29
+ article_categories: ['article_categories_en', [:article_categories, ], ],
30
+ category_title: ['category_labels_en', [:title, ], ],
31
+ category_skos: ['skos_categories_en', [:category_skos_skip, :category_skos_title, :category_skos_reln ], ],
32
+ # #
33
+ wordnet: ['wordnet_links', [:wordnet, ], ],
34
+ persondata: ['persondata_unredirected_en', [:persondata_reln, :persondata_type, ], ],
35
+ yago: ['yago_links', [:yago, :instance_type_a, :instance_type_b, ], ],
36
+ instance_types: ['instance_types_en', [:yago, :instance_type_a, :instance_type_b, ], ],
37
+ property_specmap: ['specific_mappingbased_properties_en', [:property_specmap, ], ],
38
+ property_mapped: ['mappingbased_properties_unredirected_en', [
39
+ :property_str, :property_bool, :property_int,
40
+ :property_float, :property_date, :property_yearmonth, :property_monthday,
41
+ :persondata_reln, :persondata_type, :property_foaf, :property_desc, ], ],
42
+ topical_concepts: ['topical_concepts_unredirected_en', [:topical_concepts, ], ],
43
+ }
44
+
45
+ module Re
46
+ ##
47
+ # Container for the character classes specified in
48
+ # <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
49
+ # Borrowed from the addressable gem
50
+ module Uri
51
+ ALPHA = "a-zA-Z"
52
+ DIGIT = "0-9"
53
+ GEN_DELIMS = "\\:\\/\\?\\#\\[\\]\\@"
54
+ SUB_DELIMS = "\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\="
55
+ RESERVED = GEN_DELIMS + SUB_DELIMS
56
+ UNRESERVED = ALPHA + DIGIT + "\\-\\.\\_\\~"
57
+ PCHAR = UNRESERVED + SUB_DELIMS + "\\:\\@"
58
+ SCHEME = ALPHA + DIGIT + "\\-\\+\\."
59
+ AUTHORITY = PCHAR
60
+ PATH = PCHAR + "\\/"
61
+ QUERY = PCHAR + "\\/\\?"
62
+ FRAGMENT = PCHAR + "\\/\\?"
63
+ #
64
+ PATHSEG = ""
65
+ end
66
+ end
@@ -0,0 +1,260 @@
1
+ #!/usr/bin/env ruby
2
+ require_relative './dbpedia_common'
3
+ require 'ap'
4
+
5
+ # Notes:
6
+ #
7
+ # * disambiguation: `generic disambiguates specifics` -- `["Alien", "Alien_(law)"]` and `["Alien", "Alien_(film)"]`
8
+ # * redirects: `dupe redirects to actual` -- `["Oxygen-13", "Isotopes_of_oxygen"]`
9
+ # * page_link: `from links to into` -- `["Achilles", "Greeks"]`
10
+
11
+ module Dbpedia
12
+
13
+ DECIMAL_NUM_RE = '[\-\+\d]+\.\d+'
14
+ URI_PATHCHARS = '\w\-\.\'~!$&()*+,;=:@'
15
+ # all backslash-escaped characters, or non-quotes, up to the first quote
16
+ DBLQ_STRING_C = '"(?<%s>(?:\\\\.|[^\"])*)"' # template for a double-quoted string capture (%s = capture name); a backslash and the character after it are consumed as a pair, so \" does not end the match
17
+
18
+ # output flavors:
19
+ #
20
+ # :abstract_long :abstract_short :category :category_reln :disambiguation
21
+ # :external_link :geo_coordinates :homepage :instance_of :page_id :page_link
22
+ # :persondata_reln :property :redirects :sameas :subject :title :wikipedia_link
23
+ #
24
+
25
+ MAPPING_INFO = {
26
+ # atomic topic properties
27
+ title: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :title, ], },
28
+ page_id: { kind: :page_id, fields: [:page_id, :wp_ns, :wikipedia_id, :wikipedia_pageid, ], },
29
+ abstract_short: { kind: :abstract_short, fields: [:page_id, :wp_ns, :wikipedia_id, :abstract, ], },
30
+ abstract_long: { kind: :abstract_long, fields: [:page_id, :wp_ns, :wikipedia_id, :abstract, ], },
31
+ wikipedia_lang: { kind: :skip, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :url, :slug, :lang, ], },
32
+ wikipedia_link: { kind: :wikipedia_link, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :url, :slug, :revision_id, ], },
33
+ wikipedia_backlink: { kind: :skip, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :url, :slug, :revision_id, ], },
34
+ geo_coordinates: { kind: :geo_coordinates, fields: [:page_id, :wp_ns, :wikipedia_id, :lat, :lng, ], },
35
+ geo_coord_skip_a: { kind: :skip, fields: [], },
36
+ geo_coord_skip_b: { kind: :skip, fields: [], },
37
+ # links between topics
38
+ page_link: { kind: :page_link, fields: [:page_id, :wp_ns, :from_id, :relation, :into_id, ], },
39
+ disambiguation: { kind: :disambiguation, fields: [:page_id, :wp_ns, :generic_wpid, :relation, :specific_wpid, ], },
40
+ redirects: { kind: :redirects, fields: [:page_id, :wp_ns, :dupe_id, :relation, :wikipedia_id, ], },
41
+ # external links and sameas'es
42
+ external_link: { kind: :external_link, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :weblink_url, ], },
43
+ homepage: { kind: :homepage, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :weblink_url, ], },
44
+ geonames: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :geonames_id, ], },
45
+ musicbrainz: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :musicbrainz_type, :musicbrainz_id,], },
46
+ nytimes: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :nytimes_id, ], },
47
+ pnd: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :pnd_id, ], },
48
+ uscensus: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :country_id, :state_id, :kind, :adm2_id, :adm3_id, :adm4_id], },
49
+ # category links
50
+ category_skos_type: { kind: :instance_of, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class ], },
51
+ category_skos_title: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :val_type, :category_title, ], },
52
+ category: { kind: :category, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :specific_wpid, ], },
53
+ category_subject: { kind: :subject, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :into_wpid, ], },
54
+ category_reln: { kind: :category_reln, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :into_wpid, ], },
55
+ # properties
56
+ wordnet: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :wn_reln, :wn_class, :wn_pos, :wn_idx, ], },
57
+ property_bool: { kind: :property_bool, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
58
+ property_int: { kind: :property_int, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
59
+ property_float: { kind: :property_float, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
60
+ property_date: { kind: :property_date, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
61
+ property_yearmonth: { kind: :property_yearmonth, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
62
+ property_monthday: { kind: :property_monthday, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
63
+ property_str: { kind: :property_str, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
64
+ #
65
+ persondata_reln: { kind: :persondata_reln, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :into_wpid, ], },
66
+ # persondata_type: { kind: :# persondata_type, fields: [:page_id, :wp_ns, :wikipedia_id, :property, ], },
67
+ property_foaf: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
68
+ property_desc: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :name, ], },
69
+ yago: { kind: :category, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class, ], },
70
+ instance_type_a: { kind: :instance_of, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class, ], },
71
+ instance_type_b: { kind: :instance_of, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class, ], },
72
+ property_specmap: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :units, :val, ], },
73
+ # topical_concepts: { kind: :# topical_concepts, fields: [:page_id, :wp_ns, :wikipedia_id, :skos_subject :x, ], },
74
+ }
75
+
76
+ RDF_RES = {
77
+ # type descriptions
78
+ dbpedia_class: 'http://dbpedia\.org/class/(?<%s>[^>\s]+)',
79
+ dbpedia_ontb: 'http://dbpedia\.org/ontology',
80
+ dbpedia_ont: 'http://dbpedia\.org/ontology/(?<%s>[\w\/]+)',
81
+ dbpedia_prop: 'http://dbpedia\.org/property/(?<%s>\w+)',
82
+ dbpedia_rsrc: 'http://dbpedia\.org/resource/(?<%s>[' + URI_PATHCHARS + '%%\/]+)',
83
+ yago_class: 'http://dbpedia\.org/class/(?<%s>yago)/(?<%s>[' + URI_PATHCHARS + '%%\/]+)',
84
+ wikipedia_rsrc: '(?<%s>http://\w\w\.wikipedia\.org/wiki/(?<%s>[' + URI_PATHCHARS + '%%\/]+))',
85
+ wiki_category: 'http://en\.wikipedia\.org/wiki/Category:Futurama?oldid=485425712\\#absolute-line=1',
86
+ wiki_link_id: 'http://en\.wikipedia\.org/wiki/(?<%s>[^\?]+)\?oldid=(?<%s>\d+)(?:\\#absolute-line=(?<%s>\d+))?',
87
+ wiki_link_id_sec: 'http://en\.wikipedia\.org/wiki/(?<%s>[^\?]+)\?oldid=(?<%s>\d+)\\#?(?:section=(?<%s>.*?)\&relative-line=(?<%s>\d+))?(?:&?absolute-line=(?<%s>\d+))?',
88
+ purl_desc: 'http://purl\.org/dc/elements/1\.1/(?<%s>description)',
89
+ purl_lang: 'http://purl\.org/dc/elements/1\.1/language',
90
+ purl_subject: 'http://purl\.org/dc/terms/subject',
91
+ rdf_type: 'http://www\.w3\.org/1999/02/22-rdf-syntax-ns\\#type',
92
+ rdf_comment: 'http://www\.w3\.org/2000/01/rdf-schema\\#comment',
93
+ rdf_label: 'http://www\.w3\.org/2000/01/rdf-schema\\#label',
94
+ # external links and sameas'es
95
+ same_as: 'http://www\.w3\.org/2002/07/owl\\#sameAs',
96
+ wordnet_inst: 'http://www\.w3\.org/2006/03/wn/wn20/instances/(?<%s>synset)-(?<%s>\w+)-(?<%s>noun)-(?<%s>[0-9]+)',
97
+ musicbrainz_rsrc: 'http://zitgist\.com/music/(?<%s>\w+)/(?<%s>[a-f0-9\-]+)',
98
+ nytimes_rsrc: 'http://data\.nytimes\.com/(?<%s>[A-Z0-9]+)',
99
+ geonames_rsrc: 'http://sws\.geonames\.org/(?<%s>\d+)/',
100
+ georss_type: 'http://www\.georss\.org/georss/point',
101
+ wgs_latorlng: 'http://www\.w3\.org/2003/01/geo/wgs84_pos\\#(?:lat|long)',
102
+ # http://www.rdfabout.com/rdf/usgov/geo/ us / ak / counties /bethel_area /an_subarea /aniak >
103
+ uscensus_url: 'http://www.rdfabout.com/rdf/usgov/geo/(?<%s>us)/(?<%s>\w\w)(?:/(?<%s>counties)/(?<%s>\w+)(?:/(?<%s>\w+)\/?(?<%s>\w+)?)?)?',
104
+ # category links
105
+ skos_subject: 'http://www\.w3\.org/2004/02/skos/core\\#subject',
106
+ skos_concept: 'http://www\.w3\.org/2004/02/skos/core\\#(?<%s>[a-zA-Z]+)',
107
+ foaf_homepage: 'http://xmlns\.com/foaf/0\.1/homepage',
108
+ foaf_name: 'http://xmlns\.com/foaf/0\.1/name',
109
+ foaf_topic: 'http://xmlns\.com/foaf/0\.1/(?:isPrimaryTopicOf|primaryTopic)',
110
+ foaf_prop: 'http://xmlns\.com/foaf/0\.1/(?<property>\w+)',
111
+ # property values
112
+ georss_latlng: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\\s(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"@\w\w',
113
+ rdf_eol: '\\.',
114
+ #
115
+ rdf_bool: '\"(?<%s>true|false )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>boolean)>',
116
+ rdf_date: '\"(?<%s>-?\d\d\d\d-\d\d-\d\d )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>date)>',
117
+ rdf_yearmonth: '\"(?<%s>-?\d\d\d\d-\d\d )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>gYearMonth)>',
118
+ rdf_monthday: '\"(?<%s>--\d\d-\d\d )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>gMonthDay)>',
119
+ rdf_int: '\"(?<%s>[\+\-]?\d+ )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>integer|gYear|positiveInteger|nonNegativeInteger)>',
120
+ rdf_float: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>float|double)>',
121
+ # all backslash-escaped characters, or non-quotes, up to the first quote
122
+ rdf_string: '"(?<%s>(?:\\\\.|[^\"])*)"@en',
123
+ dbpedia_value: '"(?<%s>(?:\\\\.|[^\"])*)"\\^\\^<http://dbpedia\.org/datatype/(?<%s>[a-zA-Z]+)>',
124
+ #
125
+ url_loose: '(?<%s>(?:https?|ftp)://(?:[a-zA-Z0-9\-]+\.)+(?:[a-zA-Z\-]+)[^\s>]*)',
126
+ # rdf_value: '\"(?<%s>-?\d\d\d\d-\d\d-\d\d|-?\d\d\d\d-\d\d|--\d\d-\d\d|[\+\-]?\d+|[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?|true|false)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>integer|date|gYearMonth|gMonthDay|gYear|positiveInteger|nonNegativeInteger|float|double|boolean)>',
127
+ schema_type: 'http://(?<%s>www\\.w3\\.org/2002/07/owl|schema\\.org|dbpedia\\.org/ontology|purl\\.org/ontology|xmlns.com/foaf/0\\.1)[/\\#](?<%s>[^>]+)'
128
+ }
129
+
130
+ SCHEMA_SCHEMES = {
131
+ 'www.w3.org/2002/07/owl' => 'owl',
132
+ 'schema.org' => 'schemaorg',
133
+ 'dbpedia.org/ontology' => 'dbpedia',
134
+ 'purl.org/ontology' => 'purl',
135
+ 'xmlns.com/foaf/0.1' => 'foaf'
136
+ }
137
+
138
+ # Look up a regexp template in RDF_RES by name and substitute the given capture-group names for its %s placeholders
139
+ private # NOTE(review): a bare `private` does not apply to `def self.` methods, so r presumably stays publicly callable -- confirm intent (private_class_method would enforce it)
140
+ def self.r(regexp_name, *args) # regexp_name: key into RDF_RES; args: values for the template's %s slots, in order
141
+ RDF_RES[regexp_name] % args # String#% formatting of the template; the result is the filled-in pattern source
142
+ end
143
+ public
144
+
145
# Full-line regexps, one per recognized flavor of dump line, keyed by flavor name.
# Each is assembled from RDF_RES fragments via r(); the symbols handed to r() are
# substituted into the fragment's %s placeholders -- presumably always as
# named-capture names, as in the :schema_type fragment (verify against the rest
# of RDF_RES).
#
# All patterns use the /x flag, under which literal whitespace in the pattern is
# ignored; that is why the separators between fields are spelled out as \s.
# Only :abstract_long additionally uses /m.
+ MAPPING_RES = {
146
+ # atomic topic properties
147
+ title: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_label)}> \s#{r(:rdf_string, :title )} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
148
+ page_id: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageID> \s#{r(:rdf_int, :wikipedia_pageid, :_dtyp)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
149
+ wikipedia_lang: %r{\A<#{r(:wikipedia_rsrc, :url, :slug)}> \s<#{r(:purl_lang)}> \s#{r(:rdf_string, :lang)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
150
+ wikipedia_link: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_topic)}> \s<#{r(:wikipedia_rsrc, :url, :slug)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
151
+ wikipedia_backlink: %r{\A<#{r(:wikipedia_rsrc, :url, :slug)}> \s<#{r(:foaf_topic)}> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
152
+ abstract_short: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_comment)}> \s#{r(:rdf_string, :abstract)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
153
+ abstract_long: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/abstract> \s#{r(:rdf_string, :abstract)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}xm,
154
+ geo_coordinates: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:georss_type)}> \s#{r(:georss_latlng, :lat, :lng)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
155
+ geo_coord_skip_a: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<http://www\.opengis\.net/gml/_Feature> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
156
+ geo_coord_skip_b: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:wgs_latorlng)}> \s#{r(:rdf_float, :val, :_dtyp)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
157
+ # links between topics
158
+ page_link: %r{\A<#{r(:dbpedia_rsrc, :from_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageWikiLink> \s<#{r(:dbpedia_rsrc, :into_id)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
159
+ disambiguation: %r{\A<#{r(:dbpedia_rsrc, :generic_wpid)}> \s<#{r(:dbpedia_ontb)}/wikiPageDisambiguates> \s<#{r(:dbpedia_rsrc, :specific_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
160
+ redirects: %r{\A<#{r(:dbpedia_rsrc, :dupe_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageRedirects> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s#{r(:rdf_eol)} \z}x,
161
+ # external links and sameas'es
162
+ external_link: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageExternalLink> \s<#{r(:url_loose, :weblink_url)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
163
+ homepage: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_homepage)}> \s<#{r(:url_loose, :weblink_url)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
164
+ geonames: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:geonames_rsrc, :geonames_id)}> \s#{r(:rdf_eol)} \z}x,
165
+ musicbrainz: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:musicbrainz_rsrc, :musicbrainz_type, :musicbrainz_id)}> \s#{r(:rdf_eol)} \z}x,
166
+ nytimes: %r{\A<#{r(:nytimes_rsrc, :nytimes_id)}> \s<#{r(:same_as)}> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s#{r(:rdf_eol)} \z}x,
167
+ uscensus: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:uscensus_url, :country_id, :state_id, :kind, :adm2_id, :adm3_id, :adm4_id)}> \s#{r(:rdf_eol)} \z}x,
168
+ pnd: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/individualisedPnd> \s#{r(:rdf_string, :pnd_id)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
169
+ # category links
170
+ category: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:purl_subject)}> \s<#{r(:dbpedia_rsrc, :specific_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
171
+ category_skos_type: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:skos_concept, :obj_class)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
172
+ category_subject: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_subject, :relation)}> \s<#{r(:dbpedia_rsrc, :into_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
173
+ category_reln: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_concept, :relation)}> \s<#{r(:dbpedia_rsrc, :into_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
174
+ category_skos_title: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_concept, :relation)}> \s#{r(:rdf_string, :category_title)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
175
+ # properties
176
+ wordnet: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_prop, :property)}> \s<#{r(:wordnet_inst, :wn_reln, :wn_class, :wn_pos, :wn_idx)}> \s#{r(:rdf_eol)} \z}x,
177
+ #
178
+ property_bool: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_bool, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
179
+ property_int: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_int, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
180
+ property_float: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_float, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
181
+ property_date: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_date, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
182
+ property_yearmonth: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_yearmonth, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
183
+ property_monthday: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_monthday, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
184
+ property_str: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_string, :val) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
185
+ #
186
+ persondata_reln: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s<#{r(:dbpedia_rsrc, :into_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
187
+ property_foaf: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_prop, :property)}> \s#{r(:rdf_string, :val)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
188
+ property_desc: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:purl_desc, :property)}> \s#{r(:rdf_string,:name)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
189
+ yago: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:yago_class, :scheme, :obj_class)}> \s#{r(:rdf_eol)} \z}x,
190
+ instance_type_a: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:dbpedia_ont, :obj_class)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
191
+ instance_type_b: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:schema_type, :org, :obj_class)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
192
+ property_specmap: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:dbpedia_value, :val, :units)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
193
+ # topical_concepts: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_subject)}> \s<#{r(:x, )}> \z},
194
+ }
195
# Attach each compiled regexp to the MAPPING_INFO entry of the same name under :re,
# which is where RdfExtractor#process reads it from. MAPPING_INFO must already hold
# an entry for every flavor listed above, or the nil lookup here will raise.
+ MAPPING_RES.each{|re_name, re| MAPPING_INFO[re_name][:re] = re }
196
# Captured keys that RdfExtractor#record_for_flavor leaves out of its seen-keys tally.
+ SKIPPAPLE_FIELDS = [:flavor, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno, :val_lang, :name_lang, :_dtyp]
197
+
198
# Line streamer that recognizes one dump line at a time and emits a flat record
# for it.
#
# Each incoming line is tried against the regexps registered in MAPPING_INFO, in
# that table's order; the first one that matches decides the record's flavor,
# kind and output fields. Tallies of the captured keys and of the :property
# values seen are logged once the stream ends.
class RdfExtractor < Wukong::Streamer::LineStreamer
  include MungingUtils
  attr_accessor :flavor, :kind, :filename, :regexps, :seen_keys, :seen_props

  # Sets up the tallies. The arguments are forwarded untouched to the parent
  # streamer so that its own initialization is not skipped.
  def initialize(*args)
    super
    @seen_keys  = Hash.new(0)
    @seen_props = Hash.new(0)
  end

  # Turns the named captures of a matched line into an output record.
  #
  # @param kind [Symbol] record kind from MAPPING_INFO; :skip suppresses the record
  # @param fields [Array<Symbol>] capture names to emit, in output order
  # @param flavor [Symbol] name of the MAPPING_INFO entry that matched
  # @param hsh [Hash] the captures, keyed by name; modified in place
  # @return [Array, nil] `[kind, *values]`, or nil when the line is to be skipped
  #   (kind is :skip, or an :instance_type_b line whose scheme resolves to 'owl')
  # @raise [RuntimeError] if a wikipedia link's slug differs from its wikipedia_id,
  #   or if any value contains control characters (see #sanity_check)
  def record_for_flavor(kind, fields, flavor, hsh)
    hsh.merge!( wp_ns: 0, flavor: flavor )
    return if kind == :skip

    # free-text values are JSON-encoded; a few flavors get a fixed :scheme or :property
    case flavor
    when :property_str, :property_foaf   then hsh[:val]            = MultiJson.encode(hsh[:val])
    when :abstract_long, :abstract_short then hsh[:abstract]       = MultiJson.encode(hsh[:abstract])
    when :title                          then hsh[:title]          = MultiJson.encode(hsh[:title]) ; hsh[:property] = 'title'
    when :category_skos_title            then hsh[:category_title] = MultiJson.encode(hsh[:category_title])
    when :category_skos_type             then hsh[:scheme] = 'skos'
    when :category_subject               then hsh[:scheme] = 'subject'
    when :instance_type_a                then hsh[:scheme] = 'dbpedia'
    when :instance_type_b
      hsh[:scheme] = SCHEMA_SCHEMES[hsh.delete(:org)]
      return if hsh[:scheme] == 'owl'
    when :wikipedia_link, :wikipedia_backlink
      raise "Titles disagree!" unless hsh[:slug] == hsh[:wikipedia_id]
    end

    # record seen properties, seen fields: only keys that are neither emitted
    # nor listed in SKIPPAPLE_FIELDS are tallied
    hsh.except(*fields).except(*SKIPPAPLE_FIELDS).
      each{|key, val| seen_keys[key] += 1 if val.present? }
    seen_props[hsh[:property]] += 1 if hsh[:property].present?
    sanity_check(hsh)
    #
    [kind] + hsh.values_at(*fields)
  end

  # Refuses records whose values contain control characters.
  #
  # @param hsh [Hash] the record's fields
  # @return [Hash] hsh, when every value is clean
  # @raise [RuntimeError] naming the first offending field
  def sanity_check(hsh)
    hsh.each do |key, val|
      next unless CONTROL_CHARS_RE =~ val.to_s
      raise "Control characters in field #{key.inspect}: #{val.inspect}"
    end
  end

  # Logs the tallies gathered while streaming.
  def after_stream
    Log.info ["seen keys:", seen_keys.inspect, "seen props:", seen_props.inspect].join("\t")
  end

  # Handles one input line.
  #
  # Lines that are empty or start with '#' are dropped. Lines containing a
  # `=> name.ext <=` marker are yielded unchanged as a one-element record.
  # Anything else is matched against MAPPING_INFO; a line that matches nothing
  # is logged as a warning.
  #
  # @param line [String] one line of input
  # @yield [Array, nil] the record for the line
  def process(line)
    return if line =~ /\A(?:\#|$)/
    if (line =~ /=> \w+\.\w+ <=/) then yield [line] ; return ; end

    MAPPING_INFO.each do |flavor, info|
      next unless mm = info[:re].match(line)
      # NOTE(review): record_for_flavor returns nil for skipped lines, and that nil
      # is still yielded -- confirm the emitter is meant to receive it.
      yield record_for_flavor(info[:kind], info[:fields], flavor, mm.as_hash)
      return
    end

    Log.warn ['not found:', line].join("\t")
  end
end
256
+ end
257
+
258
+
259
+
260
# Script entry point: hand RdfExtractor to Wukong::Script and run it. nil is given
# for the second argument -- presumably the reducer slot, i.e. a map-only job;
# confirm against Wukong::Script's signature.
+ Wukong::Script.new(Dbpedia::RdfExtractor, nil).run
@@ -0,0 +1,20 @@
1
module Dbpedia

  # Line streamer that reduces an owl:sameAs statement about a DBpedia resource
  # to the pair (resource title, target URL without its http:// prefix).
  class SameasExtractor < Wukong::Streamer::LineStreamer
    include MungingUtils

    # One whole statement: <dbpedia resource> <owl#sameAs> <http target> .
    # Compiled with /x, so the layout whitespace below is not part of the pattern.
    SAME_AS_RE = %r{\A
      <http://dbpedia\.org/resource/(?<title>[^>]+)>
      \s <http://www\.w3\.org/2002/07/owl\#sameAs>
      \s <http://(?<target>[^>]+)>
      \s \.
      \z}x

    # @param line [String] one line of input
    # @return [Array<String>, nil] `[title, target]`, or nil (after reporting the
    #   line through warn_record) when the line is not a sameAs statement
    def recordize(line)
      matched = SAME_AS_RE.match(line)
      unless matched
        warn_record("Unrecognized line type", line)
        return
      end
      [matched[:title], matched[:target]]
    end
  end

end
@@ -0,0 +1,97 @@
1
+ require 'configliere' ; Settings.use :commandline
2
+ require 'gorillib'
3
+ require 'gorillib/data_munging'
4
+ require 'pry'
5
+ require 'rake'
6
# Root directory of the project: taken from the environment when set, otherwise
# the parent of this file's directory.
#
# Both spellings that have been in use -- PROJ_ROOT / PROJ_CONTENTS and
# BOOK_ROOT / BOOK_CONTENTS -- are honored, so code written against either name
# keeps working. PROJ_CONTENTS wins when both environment variables are set.
PROJ_ROOT = (ENV['PROJ_CONTENTS'] || ENV['BOOK_CONTENTS'] || File.expand_path('..', File.dirname(__FILE__)))
BOOK_ROOT = PROJ_ROOT

Settings.define :mini, type: :boolean, default: false, description: "use sample data or full data?"
Settings.resolve!
# suffix for data file names: "-sample" when --mini is given, empty otherwise
Settings[:mini_slug] = Settings.mini ? "-sample" : ""

# dummy dependency to force a task. TASK FORCE DELTA GO
task :force

# Named path segments. :proj_root and :book_root point at the same directory;
# everything else hangs off :proj_root.
Pathname.register_paths(
  proj_root: PROJ_ROOT,
  book_root: BOOK_ROOT,
  root:      [:proj_root],
  code:      [:proj_root, 'code'],
  data:      [:proj_root, 'data'],
  work:      [:proj_root, 'tmp']
)
35
+
36
+ require 'rake/name_space'
37
module ::Rake
  # Additions to Rake's namespace object, used by the `chain` helper to find the
  # tasks defined directly inside a namespace.
  class NameSpace
    # Colon-separated name of this namespace, outermost part first.
    #
    # Depending on the Rake version @scope is either a plain Array of names or a
    # scope object that renders itself through #path; both are handled.
    #
    # @return [String] e.g. "parse:nest"
    def name
      @scope.respond_to?(:path) ? @scope.path : @scope.join(':')
    end

    # Tasks defined immediately inside this namespace, i.e. named
    # "<name>:<word>" with no further nesting.
    #
    # The namespace name is escaped before being used in the pattern, so
    # characters such as '.' in a namespace name are matched literally.
    #
    # @return [Array] the matching tasks
    def direct_tasks
      direct_child = /\A#{Regexp.escape(name)}:\w+\z/
      tasks.select{|task| task.name =~ direct_child }
    end
  end
end
48
+
49
# Defines a plain Rake task with optional prerequisites, documentation and a list
# of tasks to invoke when it runs.
#
# The :invoke tasks are invoked inside the task's action, just before the block,
# in the same way file_task does it. Defining a step therefore never runs
# anything by itself.
#
# @param target [Symbol, String] name of the task to define
# @param options [Hash]
# @option options [Symbol, String, Array] :after  prerequisite task(s)
# @option options [Symbol, String, Array] :invoke task(s) to invoke when this one runs
# @option options [String] :doc description shown by `rake -T`
# @yield [target] the task's action; optional
# @return [Symbol, String] target
def step(target, options = {})
  deps = [options[:after]].flatten.compact
  desc(options[:doc]) if options[:doc]
  task target => deps do
    Array(options[:invoke]).each{|task_name| Rake::Task[task_name].invoke }
    yield target if block_given?
  end
  target
end
58
+
59
# Defines a Rake file task for the path that `name` resolves to, plus a plain
# task called `name` that depends on it.
#
# The file's parent directory is declared as a directory task and added to the
# prerequisites. When the file task runs it first invokes the :invoke tasks, logs
# what it is creating, then yields the target path to the block.
#
# @param name [Symbol, String] short task name, resolved through Pathname.of
#   (presumably against the registered paths -- confirm against that helper)
# @param options [Hash]
# @option options [Symbol, String, Array] :after  additional prerequisite(s)
# @option options [Symbol, String, Array] :invoke task(s) to invoke before the block
# @yield [target] the resolved path; optional
# @return the resolved path
def file_task(name, options={})
  target     = Pathname.of(name)
  parent_dir = File.dirname(target.to_s)

  # the short name is just an alias for the concrete file
  task(name => target)
  directory(parent_dir)

  prerequisites = [options[:after], parent_dir].flatten.compact
  file(target => prerequisites) do
    Array.wrap(options[:invoke]).each do |task_name|
      Rake::Task[task_name].invoke
    end
    Log.info "Creating #{name} => #{target}"
    yield target if block_given?
  end

  target
end
73
+
74
# Like file_task, but opens the target for binary writing and hands the open
# file, rather than its path, to the block. The file is closed when the block
# returns.
#
# @param name [Symbol, String] short task name, as for file_task
# @param options [Hash] passed through to file_task
# @yield [target_file] the file, opened in 'wb' mode
# @return whatever file_task returns
def create_file(name, options={})
  file_task(name, options) do |target|
    File.open(target, 'wb'){|io| yield io }
  end
end
81
+
82
# Defines a task `name` together with a namespace of the same name, and makes
# the task depend on every task defined directly inside that namespace.
#
# @example `parse` ends up depending on 'parse:bob', but not on 'parse:nest:two_down'
#   chain('parse') do
#     task('bob'){ ... }
#     chain('nest'){ task('two_down') }
#   end
#
# @param name [String, Symbol] name of both the task and the namespace
# @param doc [String, nil] description for the task
# @return the namespace, or nil when no block is given (only the bare task is
#   defined in that case)
def chain(name, doc=nil, &block)
  desc(doc) if doc
  task(name)
  if block
    ns = namespace(name, &block)
    # direct_tasks is the NameSpace extension that lists immediate children only
    task(name => ns.direct_tasks)
    ns
  end
end