wukong 3.0.0.pre2 → 3.0.0.pre3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (146) hide show
  1. data/Gemfile +13 -0
  2. data/README.md +182 -6
  3. data/bin/wu-local +13 -5
  4. data/bin/wu-server +1 -1
  5. data/examples/Gemfile +2 -1
  6. data/examples/basic/string_reverser.rb +23 -0
  7. data/examples/{tiny_count.rb → basic/tiny_count.rb} +0 -0
  8. data/examples/{word_count → basic/word_count}/accumulator.rb +0 -0
  9. data/examples/{word_count → basic/word_count}/tokenizer.rb +0 -0
  10. data/examples/{word_count → basic/word_count}/word_count.rb +0 -0
  11. data/examples/deploy_pack/Gemfile +7 -0
  12. data/examples/deploy_pack/README.md +6 -0
  13. data/examples/{text/latinize_text.rb → deploy_pack/a/b/c/.gitkeep} +0 -0
  14. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  15. data/examples/deploy_pack/config/environment.rb +1 -0
  16. data/examples/{dataflow → dsl/dataflow}/fibonacci_series.rb +0 -0
  17. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  18. data/examples/{dataflow → dsl/dataflow}/simple.rb +0 -0
  19. data/examples/{dataflow → dsl/dataflow}/telegram.rb +0 -0
  20. data/examples/{workflow → dsl/workflow}/cherry_pie.dot +0 -0
  21. data/examples/{workflow → dsl/workflow}/cherry_pie.md +0 -0
  22. data/examples/{workflow → dsl/workflow}/cherry_pie.png +0 -0
  23. data/examples/{workflow → dsl/workflow}/cherry_pie.rb +0 -0
  24. data/examples/empty/.gitkeep +0 -0
  25. data/examples/graph/implied_geolocation/README.md +63 -0
  26. data/examples/graph/{minimum_spanning_tree.rb → minimum_spanning_tree/airfares_graphviz.rb} +0 -0
  27. data/examples/munging/airline_flights/indexable.rb +75 -0
  28. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  29. data/examples/munging/geo/geonames_models.rb +29 -0
  30. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +1 -0
  31. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  32. data/examples/munging/wikipedia/dbpedia/extract_links.rb +213 -146
  33. data/examples/rake_helper.rb +12 -0
  34. data/examples/ruby_project/Gemfile +7 -0
  35. data/examples/ruby_project/README.md +6 -0
  36. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  37. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  38. data/examples/serverlogs/models/logline.rb +102 -0
  39. data/examples/{dataflow/parse_apache_logs.rb → serverlogs/parser/apache_parser_widget.rb} +0 -0
  40. data/examples/serverlogs/visit_paths/common.rb +4 -0
  41. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  42. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  43. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  44. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  45. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  46. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  47. data/examples/text/{pig_latin.rb → pig_latin/pig_latinizer.rb} +0 -0
  48. data/examples/{dataflow/pig_latinizer.rb → text/pig_latin/pig_latinizer_widget.rb} +0 -0
  49. data/lib/hanuman/graph.rb +6 -1
  50. data/lib/wu/geo.rb +4 -0
  51. data/lib/wu/geo/geo_grids.numbers +0 -0
  52. data/lib/wu/geo/geolocated.rb +331 -0
  53. data/lib/wu/geo/quadtile.rb +69 -0
  54. data/{examples → lib/wu}/graph/union_find.rb +0 -0
  55. data/lib/wu/model/reconcilable.rb +63 -0
  56. data/{examples/munging/wikipedia/utils/munging_utils.rb → lib/wu/munging.rb} +7 -4
  57. data/lib/wu/social/models/twitter.rb +31 -0
  58. data/{examples/models/wikipedia.rb → lib/wu/wikipedia/models.rb} +0 -0
  59. data/lib/wukong.rb +9 -4
  60. data/lib/wukong/boot.rb +10 -1
  61. data/lib/wukong/driver.rb +65 -71
  62. data/lib/wukong/logger.rb +93 -0
  63. data/lib/wukong/processor.rb +38 -29
  64. data/lib/wukong/runner.rb +144 -0
  65. data/lib/wukong/server.rb +119 -0
  66. data/lib/wukong/spec_helpers.rb +1 -0
  67. data/lib/wukong/spec_helpers/integration_driver.rb +22 -9
  68. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +26 -4
  69. data/lib/wukong/spec_helpers/processor_helpers.rb +4 -10
  70. data/lib/wukong/spec_helpers/shared_examples.rb +12 -13
  71. data/lib/wukong/version.rb +1 -1
  72. data/lib/wukong/widget/processors.rb +13 -0
  73. data/lib/wukong/widget/serializers.rb +55 -65
  74. data/lib/wukong/widgets.rb +0 -2
  75. data/spec/hanuman/graph_spec.rb +14 -0
  76. data/spec/spec_helper.rb +4 -30
  77. data/spec/support/{wukong_test_helpers.rb → example_test_helpers.rb} +29 -2
  78. data/spec/support/integration_helper.rb +38 -0
  79. data/spec/support/model_test_helpers.rb +115 -0
  80. data/spec/wu/geo/geolocated_spec.rb +247 -0
  81. data/spec/wu/model/reconcilable_spec.rb +152 -0
  82. data/spec/wukong/widget/processors_spec.rb +0 -1
  83. data/spec/wukong/widget/serializers_spec.rb +88 -62
  84. data/spec/wukong/wu_local_spec.rb +125 -0
  85. data/wukong.gemspec +3 -16
  86. metadata +72 -266
  87. data/examples/dataflow/apache_log_line.rb +0 -100
  88. data/examples/jabberwocky.txt +0 -36
  89. data/examples/munging/Gemfile +0 -8
  90. data/examples/munging/airline_flights/airline.rb +0 -57
  91. data/examples/munging/airline_flights/airport.rb +0 -211
  92. data/examples/munging/airline_flights/flight.rb +0 -156
  93. data/examples/munging/airline_flights/models.rb +0 -4
  94. data/examples/munging/airline_flights/parse.rb +0 -26
  95. data/examples/munging/airline_flights/route.rb +0 -35
  96. data/examples/munging/airline_flights/timezone_fixup.rb +0 -62
  97. data/examples/munging/airports/40_wbans.txt +0 -40
  98. data/examples/munging/airports/filter_weather_reports.rb +0 -37
  99. data/examples/munging/airports/join.pig +0 -31
  100. data/examples/munging/airports/to_tsv.rb +0 -33
  101. data/examples/munging/airports/usa_wbans.pig +0 -19
  102. data/examples/munging/airports/usa_wbans.txt +0 -2157
  103. data/examples/munging/airports/wbans.pig +0 -19
  104. data/examples/munging/airports/wbans.txt +0 -2310
  105. data/examples/munging/rake_helper.rb +0 -62
  106. data/examples/munging/weather/.gitignore +0 -1
  107. data/examples/munging/weather/Gemfile +0 -4
  108. data/examples/munging/weather/Rakefile +0 -28
  109. data/examples/munging/weather/extract_ish.rb +0 -13
  110. data/examples/munging/weather/models/weather.rb +0 -119
  111. data/examples/munging/weather/utils/noaa_downloader.rb +0 -46
  112. data/examples/munging/wikipedia/README.md +0 -34
  113. data/examples/munging/wikipedia/Rakefile +0 -193
  114. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +0 -18
  115. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +0 -21
  116. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +0 -27
  117. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +0 -29
  118. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +0 -14
  119. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +0 -25
  120. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +0 -29
  121. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +0 -32
  122. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +0 -85
  123. data/examples/munging/wikipedia/pig_style_guide.md +0 -25
  124. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +0 -19
  125. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +0 -23
  126. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +0 -24
  127. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +0 -22
  128. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +0 -22
  129. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +0 -26
  130. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +0 -29
  131. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +0 -24
  132. data/examples/munging/wikipedia/utils/get_namespaces.rb +0 -86
  133. data/examples/munging/wikipedia/utils/namespaces.json +0 -1
  134. data/examples/string_reverser.rb +0 -26
  135. data/examples/twitter/locations.rb +0 -29
  136. data/examples/twitter/models.rb +0 -24
  137. data/examples/twitter/pt1-fiddle.pig +0 -8
  138. data/examples/twitter/pt2-simple_parse.pig +0 -31
  139. data/examples/twitter/pt2-simple_parse.rb +0 -18
  140. data/examples/twitter/pt3-join_on_zips.pig +0 -39
  141. data/examples/twitter/pt4-strong_links.rb +0 -20
  142. data/examples/twitter/pt5-lnglat_and_strong_links.pig +0 -16
  143. data/examples/twitter/states.tsv +0 -50
  144. data/examples/workflow/package_gem.rb +0 -55
  145. data/lib/wukong/widget/sink.rb +0 -16
  146. data/lib/wukong/widget/source.rb +0 -14
@@ -0,0 +1,75 @@
1
+ module Gorillib
2
+ module Model
3
+
4
+ #
5
+ # @example
6
+ # class Airport
7
+ # include Gorillib::Model::Indexable
8
+ # # ... define model
9
+ # index_on :icao_code, :city
10
+ # end
11
+ # Airport.for_icao_code('KAUS') #=> #<Airport icao_code="KAUS" ... >
12
+ # Airport.for_city('Austin') #=> #<Airport icao_code="KAUS" ... >
13
+ #
14
+ # You must implement a `load` method
15
+ #
16
+ module Indexable
17
+ extend Gorillib::Concern
18
+
19
+ included do |base|
20
+ base.class_attribute :lookups, instance_writer: false
21
+ self.lookups ||= []
22
+ end
23
+
24
+ module ClassMethods
25
+
26
+ def values
27
+ @values ||= Array.new
28
+ end
29
+
30
+ def flush_lookups
31
+ lookups.each{|idx| remove_instance_variable("@#{idx}") if instance_variable_defined?("@#{idx}") }
32
+ end
33
+
34
+ #
35
+ # @example
36
+ # class Airport
37
+ # index_on :icao_code, [:city, :cities]
38
+ # end
39
+ # Airport.for_icao_code('KAUS') #=> #<Airport icao_code="KAUS" ... >
40
+ # Airport.for_city('Austin') #=> #<Airport icao_code="KAUS" ... >
41
+ #
42
+ # NOTE: `.#{key_name}_index` method is NOT part of the framework interface.
43
+ # Only the `.for_#{key_name}` method is supported.
44
+ def index_on(*key_names)
45
+ self.lookups += key_names
46
+ self.lookups.uniq!
47
+ #
48
+ key_names.each do |key_name, index_name|
49
+ index_name ||= "#{key_name}_index"
50
+ class_eval <<-EOV, __FILE__, __LINE__ + 1
51
+ class << self
52
+ def #{index_name} # def name_index
53
+ @#{index_name} ||= # @name_index ||=
54
+ Hash[values.map{|el| [el.#{key_name}, el] }] # Hash[values.map{|el| [el.name, el]
55
+ end # end
56
+ protected(:#{index_name}) # protected :name_index
57
+ end
58
+ EOV
59
+
60
+ instance_eval <<-EOV, __FILE__, __LINE__ + 1
61
+ def for_#{key_name}(*args, &block) # def for_name(*args, &block)
62
+ #{index_name}.fetch(*args, &block) # name_index.fetch(*args, &block)
63
+ end # end
64
+ EOV
65
+ end
66
+ end
67
+
68
+
69
+
70
+ end
71
+
72
+ end
73
+
74
+ end
75
+ end
@@ -0,0 +1,90 @@
1
+ require 'spec_helper'
2
+ require 'gorillib/model'
3
+ require 'gorillib/pathname'
4
+ #
5
+ require 'gorillib/model/serialization'
6
+ require 'gorillib/model/serialization/tsv'
7
+ require 'gorillib/array/hashify'
8
+ #
9
+ require 'wu/model/indexable'
10
+
11
+ describe Gorillib::Model::Indexable, :model_spec, :only do
12
+ let(:mock_array){ mock('array') }
13
+
14
+ let(:country_code_class) do
15
+ module Gorillib::Test
16
+ remove_const(:CountryCode) if defined?(CountryCode)
17
+
18
+ class CountryCode
19
+ include Gorillib::Model
20
+ include Gorillib::Model::Indexable
21
+ field :alpha_2_code, String, position: 0
22
+ field :name, String, position: 1
23
+ def self.load
24
+ self.values << new('dj', 'Djibouti')
25
+ self.values << new('us', 'United States of America')
26
+ values
27
+ end
28
+ end
29
+
30
+ end
31
+ Gorillib::Test::CountryCode
32
+ end
33
+
34
+ let(:djibouti){ country_code_class.new('dj', 'Djibouti') }
35
+ let(:usa ){ country_code_class.new('us', 'United States of America') }
36
+
37
+ context 'test setup' do
38
+ subject{ country_code_class.load }
39
+ it{ should == [djibouti, usa] }
40
+ end
41
+
42
+ context '.values' do
43
+ # before{ country_code_class.send(:remove_instance_variable, '@values') }
44
+ it 'gets its values from .load' do
45
+ country_code_class.should_receive(:load).once.and_return mock_array
46
+ country_code_class.values.should equal(mock_array)
47
+ end
48
+ it 'memoizes once it is called' do
49
+ country_code_class.should_receive(:load).once.and_return mock_array
50
+ country_code_class.values.should equal(mock_array)
51
+ country_code_class.values.should equal(mock_array)
52
+ end
53
+ end
54
+
55
+ context '.index_on' do
56
+ it 'defines a .for_foo method' do
57
+ country_code_class.should_not respond_to(:for_name)
58
+ country_code_class.index_on(:name)
59
+ country_code_class.should respond_to(:for_name)
60
+ country_code_class.protected_methods.should include(:name_index)
61
+ end
62
+ end
63
+
64
+ context '.for_foo' do
65
+ before{ country_code_class.index_on :name }
66
+ context 'behaves like Hash#fetch:' do
67
+ context 'when key is not present' do
68
+ it 'retrieves a value if in the index' do
69
+ country_code_class.for_name('Djibouti').should == djibouti
70
+ end
71
+ end
72
+ context 'when key is not present' do
73
+ it 'and no default it raises KeyError' do
74
+ expect{ country_code_class.for_name('Yo Mama') }.to raise_error(KeyError, 'key not found: "Yo Mama"')
75
+ end
76
+ it 'returns default value if given' do
77
+ yo_mama = country_code_class.for_name('Yo Mama', 'wears combat boots')
78
+ yo_mama.should == 'wears combat boots'
79
+ end
80
+ it 'calls block if given' do
81
+ she = nil
82
+ so_fat = country_code_class.for_name('Yo Mama'){ she = 'sits around the house' ; 'when she sits' }
83
+ so_fat.should == 'when she sits'
84
+ she.should == 'sits around the house'
85
+ end
86
+ end
87
+ end
88
+ end
89
+
90
+ end
@@ -75,4 +75,33 @@ module Geo
75
75
  class GeonamesCountry < GeonamesPlace
76
76
  self.place_klass = Geo::Country
77
77
  end
78
+
79
+ # http://download.geonames.org/export/zip/
80
+ #
81
+ # country code : iso country code, 2 characters
82
+ # postal code : varchar(20)
83
+ # place name : varchar(180)
84
+ # admin name1 : 1. order subdivision (state) varchar(100)
85
+ # admin code1 : 1. order subdivision (state) varchar(20)
86
+ # admin name2 : 2. order subdivision (county/province) varchar(100)
87
+ # admin code2 : 2. order subdivision (county/province) varchar(20)
88
+ # admin name3 : 3. order subdivision (community) varchar(100)
89
+ # admin code3 : 3. order subdivision (community) varchar(20)
90
+ # latitude : estimated latitude (wgs84)
91
+ # longitude : estimated longitude (wgs84)
92
+ # accuracy : accuracy of lat/lng from 1=estimated to 6=centroid
93
+ class GeonamesPostal
94
+ field :country_id, String, doc: "iso country code, 2 characters"
95
+ field :postal_id, String, doc: "varchar(20)"
96
+ field :name, String, doc: "varchar(180)"
97
+ field :admin1_name, String, doc: "1. order subdivision (state) varchar(100)"
98
+ field :admin1_id, String, doc: "1. order subdivision (state) varchar(20)"
99
+ field :admin2_name, String, doc: "2. order subdivision (county/province) varchar(100)"
100
+ field :admin2_id, String, doc: "2. order subdivision (county/province) varchar(20)"
101
+ field :admin3_name, String, doc: "3. order subdivision (community) varchar(100)"
102
+ field :admin3_id, String, doc: "3. order subdivision (community) varchar(20)"
103
+ field :latitude, String, doc: "estimated latitude (wgs84)"
104
+ field :longitude, String, doc: "estimated longitude (wgs84)"
105
+ field :accuracy, String, doc: "accuracy of lat/lng from 1=estimated to 6=centroid"
106
+ end
78
107
  end
@@ -1,4 +1,5 @@
1
1
 
2
2
  require 'wukong'
3
3
  require 'multi_json'
4
+ require 'gorillib'
4
5
  require_relative '../utils/munging_utils.rb'
@@ -0,0 +1,66 @@
1
+
2
+ Settings.define :dbpedia_filetype, description: 'The dbpedia file type ("geo_coordinates", etc) -- taken from input filename if available'
3
+
4
+ # Settings[:dbpedia_filetype] ||= Settings[:input_paths].to_s
5
+ # Settings[:dbpedia_filetype] = File.basename(Settings[:dbpedia_filetype]).gsub(/[\.\-].*/, '')
6
+ # @flavor, flavor_info = DBPEDIA_FLAVOR_INFO.detect{|flavor, (filename, _r)| filename == Settings[:dbpedia_filetype] }
7
+ # @kind, @filename, @regexps = flavor_info
8
+
9
+ DBPEDIA_FLAVOR_INFO = {
10
+ title: ['labels_en', [:title, ], ],
11
+ page_id: ['page_ids_en', [:page_id, ], ],
12
+ wikipedia_link: ['wikipedia_links_en', [:wikipedia_links, :wikipedia_backlink, :wikipedia_lang, ], ],
13
+ abstract_short: ['short_abstracts_en', [:abstract_short, ], ],
14
+ abstract_long: ['long_abstracts_en', [:abstract_long, ], ],
15
+ geo_coordinates: ['geo_coordinates_en', [:geo_coordinates, :geo_coord_skip_a, :geo_coord_skip_b, ], ],
16
+ # #
17
+ page_links: ['page_links_unredirected_en', [:page_links, ], ],
18
+ disambiguations: ['disambiguations_unredirected_en', [:disambiguations, ], ],
19
+ redirects: ['redirects_transitive_en', [:redirects, ], ],
20
+ # #
21
+ external_links: ['external_links_en', [:external_links, ], ],
22
+ homepages: ['homepages_en', [:homepages, ], ],
23
+ geonames: ['geonames_links', [:geonames, ], ],
24
+ musicbrainz: ['musicbrainz_links', [:musicbrainz, ], ],
25
+ nytimes: ['nytimes_links', [:nytimes, ], ],
26
+ uscensus: ['uscensus_links', [:uscensus, ], ],
27
+ pnd: ['pnd_en', [:pnd, ], ],
28
+ # #
29
+ article_categories: ['article_categories_en', [:article_categories, ], ],
30
+ category_title: ['category_labels_en', [:title, ], ],
31
+ category_skos: ['skos_categories_en', [:category_skos_skip, :category_skos_title, :category_skos_reln ], ],
32
+ # #
33
+ wordnet: ['wordnet_links', [:wordnet, ], ],
34
+ persondata: ['persondata_unredirected_en', [:persondata_reln, :persondata_type, ], ],
35
+ yago: ['yago_links', [:yago, :instance_type_a, :instance_type_b, ], ],
36
+ instance_types: ['instance_types_en', [:yago, :instance_type_a, :instance_type_b, ], ],
37
+ property_specmap: ['specific_mappingbased_properties_en', [:property_specmap, ], ],
38
+ property_mapped: ['mappingbased_properties_unredirected_en', [
39
+ :property_str, :property_bool, :property_int,
40
+ :property_float, :property_date, :property_yearmonth, :property_monthday,
41
+ :persondata_reln, :persondata_type, :property_foaf, :property_desc, ], ],
42
+ topical_concepts: ['topical_concepts_unredirected_en', [:topical_concepts, ], ],
43
+ }
44
+
45
+ module Re
46
+ ##
47
+ # Container for the character classes specified in
48
+ # <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
49
+ # Borrowed from the addressable gem
50
+ module Uri
51
+ ALPHA = "a-zA-Z"
52
+ DIGIT = "0-9"
53
+ GEN_DELIMS = "\\:\\/\\?\\#\\[\\]\\@"
54
+ SUB_DELIMS = "\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\="
55
+ RESERVED = GEN_DELIMS + SUB_DELIMS
56
+ UNRESERVED = ALPHA + DIGIT + "\\-\\.\\_\\~"
57
+ PCHAR = UNRESERVED + SUB_DELIMS + "\\:\\@"
58
+ SCHEME = ALPHA + DIGIT + "\\-\\+\\."
59
+ AUTHORITY = PCHAR
60
+ PATH = PCHAR + "\\/"
61
+ QUERY = PCHAR + "\\/\\?"
62
+ FRAGMENT = PCHAR + "\\/\\?"
63
+ #
64
+ PATHSEG = ""
65
+ end
66
+ end
@@ -2,192 +2,259 @@
2
2
  require_relative './dbpedia_common'
3
3
  require 'ap'
4
4
 
5
- module Re
6
- ##
7
- # Container for the character classes specified in
8
- # <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
9
- # Borrowed from the addressable gem
10
- module Uri
11
- ALPHA = "a-zA-Z"
12
- DIGIT = "0-9"
13
- GEN_DELIMS = "\\:\\/\\?\\#\\[\\]\\@"
14
- SUB_DELIMS = "\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\="
15
- RESERVED = GEN_DELIMS + SUB_DELIMS
16
- UNRESERVED = ALPHA + DIGIT + "\\-\\.\\_\\~"
17
- PCHAR = UNRESERVED + SUB_DELIMS + "\\:\\@"
18
- SCHEME = ALPHA + DIGIT + "\\-\\+\\."
19
- AUTHORITY = PCHAR
20
- PATH = PCHAR + "\\/"
21
- QUERY = PCHAR + "\\/\\?"
22
- FRAGMENT = PCHAR + "\\/\\?"
23
- #
24
- PATHSEG = ""
25
- end
26
- end
27
- #
28
- #
5
+ # Notes:
29
6
  #
7
+ # * disambiguation: `generic disambiguates specifics` -- `["Alien", "Alien_(law)"]` and `["Alien", "Alien_(film)"]`
8
+ # * redirects: `dupe redirects to actual` -- `["Oxygen-13", "Isotopes_of_oxygen"]`
9
+ # * page_link: `from links to into` -- `["Achilles", "Greeks"]`
10
+
30
11
  module Dbpedia
31
12
 
32
- DBLQ_STRING_C = '"(?<%s>\\\"|[^\"]+)+"'
33
13
  DECIMAL_NUM_RE = '[\-\+\d]+\.\d+'
14
+ URI_PATHCHARS = '\w\-\.\'~!$&()*+,;=:@'
15
+ # a run of backslash-escaped characters or non-quote characters, up to the first quote
16
+ DBLQ_STRING_C = '"(?<%s>(?:\\.|[^\"])*)"'
17
+
18
+ # output flavors:
19
+ #
20
+ # :abstract_long :abstract_short :category :category_reln :disambiguation
21
+ # :external_link :geo_coordinates :homepage :instance_of :page_id :page_link
22
+ # :persondata_reln :property :redirects :sameas :subject :title :wikipedia_link
23
+ #
24
+
25
+ MAPPING_INFO = {
26
+ # atomic topic properties
27
+ title: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :title, ], },
28
+ page_id: { kind: :page_id, fields: [:page_id, :wp_ns, :wikipedia_id, :wikipedia_pageid, ], },
29
+ abstract_short: { kind: :abstract_short, fields: [:page_id, :wp_ns, :wikipedia_id, :abstract, ], },
30
+ abstract_long: { kind: :abstract_long, fields: [:page_id, :wp_ns, :wikipedia_id, :abstract, ], },
31
+ wikipedia_lang: { kind: :skip, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :url, :slug, :lang, ], },
32
+ wikipedia_link: { kind: :wikipedia_link, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :url, :slug, :revision_id, ], },
33
+ wikipedia_backlink: { kind: :skip, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :url, :slug, :revision_id, ], },
34
+ geo_coordinates: { kind: :geo_coordinates, fields: [:page_id, :wp_ns, :wikipedia_id, :lat, :lng, ], },
35
+ geo_coord_skip_a: { kind: :skip, fields: [], },
36
+ geo_coord_skip_b: { kind: :skip, fields: [], },
37
+ # links between topics
38
+ page_link: { kind: :page_link, fields: [:page_id, :wp_ns, :from_id, :relation, :into_id, ], },
39
+ disambiguation: { kind: :disambiguation, fields: [:page_id, :wp_ns, :generic_wpid, :relation, :specific_wpid, ], },
40
+ redirects: { kind: :redirects, fields: [:page_id, :wp_ns, :dupe_id, :relation, :wikipedia_id, ], },
41
+ # external links and sameas'es
42
+ external_link: { kind: :external_link, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :weblink_url, ], },
43
+ homepage: { kind: :homepage, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :weblink_url, ], },
44
+ geonames: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :geonames_id, ], },
45
+ musicbrainz: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :musicbrainz_type, :musicbrainz_id,], },
46
+ nytimes: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :nytimes_id, ], },
47
+ pnd: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :pnd_id, ], },
48
+ uscensus: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :country_id, :state_id, :kind, :adm2_id, :adm3_id, :adm4_id], },
49
+ # category links
50
+ category_skos_type: { kind: :instance_of, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class ], },
51
+ category_skos_title: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :val_type, :category_title, ], },
52
+ category: { kind: :category, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :specific_wpid, ], },
53
+ category_subject: { kind: :subject, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :into_wpid, ], },
54
+ category_reln: { kind: :category_reln, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :into_wpid, ], },
55
+ # properties
56
+ wordnet: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :wn_reln, :wn_class, :wn_pos, :wn_idx, ], },
57
+ property_bool: { kind: :property_bool, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
58
+ property_int: { kind: :property_int, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
59
+ property_float: { kind: :property_float, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
60
+ property_date: { kind: :property_date, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
61
+ property_yearmonth: { kind: :property_yearmonth, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
62
+ property_monthday: { kind: :property_monthday, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
63
+ property_str: { kind: :property_str, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
64
+ #
65
+ persondata_reln: { kind: :persondata_reln, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :into_wpid, ], },
66
+ # persondata_type: { kind: :# persondata_type, fields: [:page_id, :wp_ns, :wikipedia_id, :property, ], },
67
+ property_foaf: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
68
+ property_desc: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :name, ], },
69
+ yago: { kind: :category, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class, ], },
70
+ instance_type_a: { kind: :instance_of, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class, ], },
71
+ instance_type_b: { kind: :instance_of, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class, ], },
72
+ property_specmap: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :units, :val, ], },
73
+ # topical_concepts: { kind: :# topical_concepts, fields: [:page_id, :wp_ns, :wikipedia_id, :skos_subject :x, ], },
74
+ }
34
75
 
35
76
  RDF_RES = {
77
+ # type descriptions
78
+ dbpedia_class: 'http://dbpedia\.org/class/(?<%s>[^>\s]+)',
36
79
  dbpedia_ontb: 'http://dbpedia\.org/ontology',
37
80
  dbpedia_ont: 'http://dbpedia\.org/ontology/(?<%s>[\w\/]+)',
38
81
  dbpedia_prop: 'http://dbpedia\.org/property/(?<%s>\w+)',
39
- dbpedia_class: 'http://dbpedia\.org/class/(?<%s>[^>\s]+)',
40
- dbpedia_rsrc: "http://dbpedia\\.org/resource/(?<%s>[#{Re::Uri::PCHAR}%%\/]+)",
41
- wikipedia_rsrc: "http://\\w\\w\\.wikipedia\\.org/wiki/(?<%s>[#{Re::Uri::PCHAR}%%\/]+)",
42
- dbpedia_value: '"(?<%s>(?:\\\"|[^\"]+)*)"\\^\\^<http://dbpedia\.org/datatype/(?<%s>[a-zA-Z]+)>',
43
- foaf_homepage: 'http://xmlns\.com/foaf/0\.1/homepage',
44
- foaf_name: 'http://xmlns\.com/foaf/0\.1/name',
45
- foaf_topic: 'http://xmlns\.com/foaf/0\.1/(?:isPrimaryTopicOf|primaryTopic)',
46
- foaf_prop: 'http://xmlns\.com/foaf/0\.1/(?<property>\w+)',
47
- geonames_rsrc: 'http://sws\.geonames\.org/(?<%s>\d+)/',
48
- georss_latlng: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\\s(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"@en',
49
- georss_type: 'http://www\.georss\.org/georss/point',
50
- musicbrainz_rsrc: 'http://zitgist\.com/music/(?<%s>\w+)/(?<%s>[a-f0-9\-]+)',
51
- nytimes_rsrc: 'http://data\.nytimes\.com/(?<%s>[A-Z0-9]+)',
52
- purl_subject: 'http://purl\.org/dc/terms/subject',
82
+ dbpedia_rsrc: 'http://dbpedia\.org/resource/(?<%s>[' + URI_PATHCHARS + '%%\/]+)',
83
+ yago_class: 'http://dbpedia\.org/class/(?<%s>yago)/(?<%s>[' + URI_PATHCHARS + '%%\/]+)',
84
+ wikipedia_rsrc: '(?<%s>http://\w\w\.wikipedia\.org/wiki/(?<%s>[' + URI_PATHCHARS + '%%\/]+))',
85
+ wiki_category: 'http://en\.wikipedia\.org/wiki/Category:Futurama?oldid=485425712\\#absolute-line=1',
86
+ wiki_link_id: 'http://en\.wikipedia\.org/wiki/(?<%s>[^\?]+)\?oldid=(?<%s>\d+)(?:\\#absolute-line=(?<%s>\d+))?',
87
+ wiki_link_id_sec: 'http://en\.wikipedia\.org/wiki/(?<%s>[^\?]+)\?oldid=(?<%s>\d+)\\#?(?:section=(?<%s>.*?)\&relative-line=(?<%s>\d+))?(?:&?absolute-line=(?<%s>\d+))?',
53
88
  purl_desc: 'http://purl\.org/dc/elements/1\.1/(?<%s>description)',
54
89
  purl_lang: 'http://purl\.org/dc/elements/1\.1/language',
90
+ purl_subject: 'http://purl\.org/dc/terms/subject',
91
+ rdf_type: 'http://www\.w3\.org/1999/02/22-rdf-syntax-ns\\#type',
55
92
  rdf_comment: 'http://www\.w3\.org/2000/01/rdf-schema\\#comment',
56
- rdf_eol: '\\.',
57
- rdf_float: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"\^\^<http://www\.w3\.org/2001/XMLSchema\#float>',
58
- rdf_integer: '\"(?<%s>\d+)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#integer>',
59
- rdf_date: '\"(?<%s>\d\d\d\d-\d\d-\d\d)\"\^\^<http://www\.w3\.org/2001/XMLSchema\\#date>',
60
- rdf_string: '\"(?<%s>(?:\\\"|[^\"]+)*)"@(?<%s>\w+)\b',
61
93
  rdf_label: 'http://www\.w3\.org/2000/01/rdf-schema\\#label',
62
- rdf_type: 'http://www\.w3\.org/1999/02/22-rdf-syntax-ns\\#type',
94
+ # external links and sameas'es
63
95
  same_as: 'http://www\.w3\.org/2002/07/owl\\#sameAs',
64
- skos_concept: 'http://www\.w3\.org/2004/02/skos/core\\#(?<%s>[a-zA-Z]+)',
65
- skos_subject: 'http://www\.w3\.org/2004/02/skos/core\\#subject',
96
+ wordnet_inst: 'http://www\.w3\.org/2006/03/wn/wn20/instances/(?<%s>synset)-(?<%s>\w+)-(?<%s>noun)-(?<%s>[0-9]+)',
97
+ musicbrainz_rsrc: 'http://zitgist\.com/music/(?<%s>\w+)/(?<%s>[a-f0-9\-]+)',
98
+ nytimes_rsrc: 'http://data\.nytimes\.com/(?<%s>[A-Z0-9]+)',
99
+ geonames_rsrc: 'http://sws\.geonames\.org/(?<%s>\d+)/',
100
+ georss_type: 'http://www\.georss\.org/georss/point',
66
101
  wgs_latorlng: 'http://www\.w3\.org/2003/01/geo/wgs84_pos\\#(?:lat|long)',
102
+ # http://www.rdfabout.com/rdf/usgov/geo/ us / ak / counties /bethel_area /an_subarea /aniak >
103
+ uscensus_url: 'http://www.rdfabout.com/rdf/usgov/geo/(?<%s>us)/(?<%s>\w\w)(?:/(?<%s>counties)/(?<%s>\w+)(?:/(?<%s>\w+)\/?(?<%s>\w+)?)?)?',
104
+ # category links
105
+ skos_subject: 'http://www\.w3\.org/2004/02/skos/core\\#subject',
106
+ skos_concept: 'http://www\.w3\.org/2004/02/skos/core\\#(?<%s>[a-zA-Z]+)',
107
+ foaf_homepage: 'http://xmlns\.com/foaf/0\.1/homepage',
108
+ foaf_name: 'http://xmlns\.com/foaf/0\.1/name',
109
+ foaf_topic: 'http://xmlns\.com/foaf/0\.1/(?:isPrimaryTopicOf|primaryTopic)',
110
+ foaf_prop: 'http://xmlns\.com/foaf/0\.1/(?<property>\w+)',
111
+ # property values
112
+ georss_latlng: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\\s(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"@\w\w',
113
+ rdf_eol: '\\.',
114
+ #
115
+ rdf_bool: '\"(?<%s>true|false )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>boolean)>',
116
+ rdf_date: '\"(?<%s>-?\d\d\d\d-\d\d-\d\d )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>date)>',
117
+ rdf_yearmonth: '\"(?<%s>-?\d\d\d\d-\d\d )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>gYearMonth)>',
118
+ rdf_monthday: '\"(?<%s>--\d\d-\d\d )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>gMonthDay)>',
119
+ rdf_int: '\"(?<%s>[\+\-]?\d+ )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>integer|gYear|positiveInteger|nonNegativeInteger)>',
120
+ rdf_float: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>float|double)>',
121
+ # a run of backslash-escaped characters or non-quote characters, up to the first quote
122
+ rdf_string: '"(?<%s>(?:\\\\.|[^\"])*)"@en',
123
+ dbpedia_value: '"(?<%s>(?:\\\\.|[^\"])*)"\\^\\^<http://dbpedia\.org/datatype/(?<%s>[a-zA-Z]+)>',
124
+ #
67
125
  url_loose: '(?<%s>(?:https?|ftp)://(?:[a-zA-Z0-9\-]+\.)+(?:[a-zA-Z\-]+)[^\s>]*)',
68
- wiki_category: 'http://en\.wikipedia\.org/wiki/Category:Futurama?oldid=485425712\\#absolute-line=1',
69
- wiki_link_id: 'http://en\.wikipedia\.org/wiki/(?<%s>[^\?]+)\?oldid=(?<%s>\d+)\\#?(?:absolute-line=(?<%s>\d+))?',
70
- wiki_link_id_sec: 'http://en\.wikipedia\.org/wiki/(?<%s>[^\?]+)\?oldid=(?<%s>\d+)\\#?(?:section=(?<%s>.*?)\&relative-line=(?<%s>\d+))?(?:&?absolute-line=(?<%s>\d+))?',
71
- wordnet_inst: 'http://www\.w3\.org/2006/03/wn/wn20/instances/synset-(?<%s>\w+)-noun-(?<%s>[0-9]+)',
72
- yago_class: 'http://dbpedia\.org/class/yago',
73
- rdf_value: '\"(?<%s>
74
- -?\d\d\d\d-\d\d-\d\d|-?\d\d\d\d-\d\d|--\d\d-\d\d|
75
- [\+\-]?\d+ |
76
- [\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?|
77
- true|false)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>integer|date|gYearMonth|gMonthDay|gYear|positiveInteger|nonNegativeInteger|float|double|boolean)>',
78
-
79
- schema_type: 'http://(?<%s>www\\.w3\\.org/2002/07/owl|schema\\.org|dbpedia\\.org/ontology|purl\\.org/ontology|xmlns.com/foaf/0\\.1)[/\#]([^>]+)'
126
+ # rdf_value: '\"(?<%s>-?\d\d\d\d-\d\d-\d\d|-?\d\d\d\d-\d\d|--\d\d-\d\d|[\+\-]?\d+|[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?|true|false)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>integer|date|gYearMonth|gMonthDay|gYear|positiveInteger|nonNegativeInteger|float|double|boolean)>',
127
+ schema_type: 'http://(?<%s>www\\.w3\\.org/2002/07/owl|schema\\.org|dbpedia\\.org/ontology|purl\\.org/ontology|xmlns.com/foaf/0\\.1)[/\\#](?<%s>[^>]+)'
80
128
  }
81
129
 
82
- idx = 0;
83
- RDF_RES.each{|flavor, re_str| Regexp.new(re_str.gsub('%s'){|x| "l#{idx+=1}" }) }
130
+ SCHEMA_SCHEMES = {
131
+ 'www.w3.org/2002/07/owl' => 'owl',
132
+ 'schema.org' => 'schemaorg',
133
+ 'dbpedia.org/ontology' => 'dbpedia',
134
+ 'purl.org/ontology' => 'purl',
135
+ 'xmlns.com/foaf/0.1' => 'foaf'
136
+ }
84
137
 
138
+ # look up a regexp template in the table above and substitute the given capture-group names for its %s placeholders
85
139
  private
86
140
  def self.r(regexp_name, *args)
87
141
  RDF_RES[regexp_name] % args
88
142
  end
89
143
  public
90
144
 
91
- MAPPINGS = {
92
-
93
- geo_coordinates: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:georss_type)}> \s#{r(:georss_latlng, :lat, :lng)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
94
- geo_coord_skip_a: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<http://www\.opengis\.net/gml/_Feature> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
95
- geo_coord_skip_b: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:wgs_latorlng)}> \s#{r(:rdf_float, :val)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
96
- wordnet: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_prop, :property)}> \s<#{r(:wordnet_inst, :wn_class, :wn_idx)}> \s#{r(:rdf_eol)} \z}x,
97
- geonames: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:geonames_rsrc, :geonames_id)}> \s#{r(:rdf_eol)} \z}x,
98
-
99
- pnd: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/individualisedPnd> \s#{r(:rdf_string, :name, :nlang)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
100
- disambiguations: %r{\A<#{r(:dbpedia_rsrc, :generic_wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageDisambiguates> \s<#{r(:dbpedia_rsrc, :specific_wikipedia_id)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
101
- page_ids: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageID> \s#{r(:rdf_integer, :wikipedia_pageid)} \s<#{r(:wiki_link_id, :wikipedia_id2, :wikipedia_pageid, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
102
- redirects: %r{\A<#{r(:dbpedia_rsrc, :dupe_of)}> \s<#{r(:dbpedia_ontb)}/wikiPageRedirects> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s#{r(:rdf_eol)} \z}x,
103
-
104
- article_categories: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:purl_subject)}> \s<#{r(:dbpedia_rsrc, :specific_wikipedia_id)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
105
- categories_skos: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:skos_concept, :skos_relation)}> \s<#{r(:wiki_link_id, :wikipedia_id2, :wikipedia_pageid, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
106
- categories_skos_label: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_concept, :skos_relation)}> \s#{r(:rdf_string, :val, :val_lang)} \s<#{r(:wiki_link_id, :wikipedia_id2, :wikipedia_pageid, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
107
- categories_skos_reln: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_concept, :skos_relation)}> \s<#{r(:dbpedia_rsrc, :category_b)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
108
- category_labels: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_label)}> \s#{r(:rdf_string,:category_wikipedia_id, :ctl)} \s<#{r(:wiki_link_id, :wikipedia_id2, :wikipedia_pageid, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
109
-
110
- abstracts_short: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_comment)}> \s#{r(:rdf_string,:abstract, :al)} \s<#{r(:wiki_link_id, :wikipedia_id2, :wikipedia_pageid, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
111
- abstracts_long: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/abstract> \s#{r(:rdf_string,:abstract, :al)} \s<#{r(:wiki_link_id, :wikipedia_id2, :wikipedia_pageid, :article_lineno)}> \s#{r(:rdf_eol)} \z}xm,
112
- titles: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_label)}> \s#{r(:rdf_string,:wikipedia_id2, :tl )} \s<#{r(:wiki_link_id, :wikipedia_id2, :wikipedia_pageid, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
113
- nytimes: %r{\A<#{r(:nytimes_rsrc, :nyt_id)}> \s<#{r(:same_as)}> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s#{r(:rdf_eol)} \z}x,
114
-
115
- external_links: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageExternalLink> \s<#{r(:url_loose, :weblink_url)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
116
- homepages: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_homepage)}> \s<#{r(:url_loose, :weblink_url)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
117
- wikipedia_links: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_topic)}> \s<#{r(:url_loose, :weblink_url)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
118
- wikipedia_lang: %r{\A<#{r(:wikipedia_rsrc, :wikipedia_id)}> \s<#{r(:purl_lang)}> \s#{r(:rdf_string, :lang, :wll_lang)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
119
- wikipedia_rev: %r{\A<#{r(:wikipedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_topic)}> \s<#{r(:dbpedia_rsrc, :wikipedia_id2)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
120
- musicbrainz: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:musicbrainz_rsrc, :musicbrainz_type, :musicbrainz_id)}> \s#{r(:rdf_eol)} \z}x,
121
-
122
- properties_map_val: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_value, :val, :val_type)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
123
- properties_map_str: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_string, :val, :val_lang)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
124
- properties_map_foaf: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_prop, :property)}> \s#{r(:rdf_string, :val, :val_lang)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
125
- properties_specmap: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:dbpedia_value, :val, :dbpedia_units)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
126
-
127
- yago: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:schema_type, :schema, :schema_type)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
128
-
129
- persondata_type: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<http://xmlns.com/foaf/0\.1/Person> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
130
- persondata_reln: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s<#{r(:dbpedia_rsrc, :target_wikipedia_id)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
131
- persondata_prop: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_value, :val, :val_type)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
132
- persondata_foaf: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_prop, :property)}> \s#{r(:rdf_string,:name, :name_lang)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
133
- persondata_desc: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:purl_desc, :property)}> \s#{r(:rdf_string,:name, :name_lang)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
134
-
135
-
136
- #topical_concepts: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_subject)}> \s<#{r(:x, )} \z}x, #<http://dbpedia.org/resource/Futurama> \s<#{r(:wiki_category)}> .
137
- # uscensus: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:url_loose, :census_url)}> \s#{r(:rdf_eol)} \z}x,
138
- # instance_types: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:dbpedia_ont, :specific_wikipedia_id)}> }x,# \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :wikipedia_pageid, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
145
+ MAPPING_RES = {
146
+ # atomic topic properties
147
+ title: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_label)}> \s#{r(:rdf_string, :title )} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
148
+ page_id: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageID> \s#{r(:rdf_int, :wikipedia_pageid, :_dtyp)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
149
+ wikipedia_lang: %r{\A<#{r(:wikipedia_rsrc, :url, :slug)}> \s<#{r(:purl_lang)}> \s#{r(:rdf_string, :lang)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
150
+ wikipedia_link: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_topic)}> \s<#{r(:wikipedia_rsrc, :url, :slug)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
151
+ wikipedia_backlink: %r{\A<#{r(:wikipedia_rsrc, :url, :slug)}> \s<#{r(:foaf_topic)}> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
152
+ abstract_short: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_comment)}> \s#{r(:rdf_string, :abstract)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
153
+ abstract_long: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/abstract> \s#{r(:rdf_string, :abstract)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}xm,
154
+ geo_coordinates: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:georss_type)}> \s#{r(:georss_latlng, :lat, :lng)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
155
+ geo_coord_skip_a: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<http://www\.opengis\.net/gml/_Feature> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
156
+ geo_coord_skip_b: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:wgs_latorlng)}> \s#{r(:rdf_float, :val, :_dtyp)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
157
+ # links between topics
158
+ page_link: %r{\A<#{r(:dbpedia_rsrc, :from_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageWikiLink> \s<#{r(:dbpedia_rsrc, :into_id)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
159
+ disambiguation: %r{\A<#{r(:dbpedia_rsrc, :generic_wpid)}> \s<#{r(:dbpedia_ontb)}/wikiPageDisambiguates> \s<#{r(:dbpedia_rsrc, :specific_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
160
+ redirects: %r{\A<#{r(:dbpedia_rsrc, :dupe_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageRedirects> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s#{r(:rdf_eol)} \z}x,
161
+ # external links and same-as links
162
+ external_link: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageExternalLink> \s<#{r(:url_loose, :weblink_url)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
163
+ homepage: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_homepage)}> \s<#{r(:url_loose, :weblink_url)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
164
+ geonames: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:geonames_rsrc, :geonames_id)}> \s#{r(:rdf_eol)} \z}x,
165
+ musicbrainz: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:musicbrainz_rsrc, :musicbrainz_type, :musicbrainz_id)}> \s#{r(:rdf_eol)} \z}x,
166
+ nytimes: %r{\A<#{r(:nytimes_rsrc, :nytimes_id)}> \s<#{r(:same_as)}> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s#{r(:rdf_eol)} \z}x,
167
+ uscensus: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:uscensus_url, :country_id, :state_id, :kind, :adm2_id, :adm3_id, :adm4_id)}> \s#{r(:rdf_eol)} \z}x,
168
+ pnd: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/individualisedPnd> \s#{r(:rdf_string, :pnd_id)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
169
+ # category links
170
+ category: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:purl_subject)}> \s<#{r(:dbpedia_rsrc, :specific_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
171
+ category_skos_type: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:skos_concept, :obj_class)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
172
+ category_subject: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_subject, :relation)}> \s<#{r(:dbpedia_rsrc, :into_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
173
+ category_reln: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_concept, :relation)}> \s<#{r(:dbpedia_rsrc, :into_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
174
+ category_skos_title: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_concept, :relation)}> \s#{r(:rdf_string, :category_title)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
175
+ # properties
176
+ wordnet: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_prop, :property)}> \s<#{r(:wordnet_inst, :wn_reln, :wn_class, :wn_pos, :wn_idx)}> \s#{r(:rdf_eol)} \z}x,
177
+ #
178
+ property_bool: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_bool, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
179
+ property_int: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_int, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
180
+ property_float: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_float, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
181
+ property_date: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_date, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
182
+ property_yearmonth: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_yearmonth, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
183
+ property_monthday: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_monthday, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
184
+ property_str: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_string, :val) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
185
+ #
186
+ persondata_reln: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s<#{r(:dbpedia_rsrc, :into_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
187
+ property_foaf: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_prop, :property)}> \s#{r(:rdf_string, :val)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
188
+ property_desc: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:purl_desc, :property)}> \s#{r(:rdf_string,:name)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
189
+ yago: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:yago_class, :scheme, :obj_class)}> \s#{r(:rdf_eol)} \z}x,
190
+ instance_type_a: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:dbpedia_ont, :obj_class)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
191
+ instance_type_b: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:schema_type, :org, :obj_class)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
192
+ property_specmap: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:dbpedia_value, :val, :units)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
193
+ # topical_concepts: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_subject)}> \s<#{r(:x, )}> \z},
139
194
  }
140
-
141
- # ap MAPPINGS
195
+ MAPPING_RES.each{|re_name, re| MAPPING_INFO[re_name][:re] = re }
196
+ SKIPPAPLE_FIELDS = [:flavor, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno, :val_lang, :name_lang, :_dtyp]
142
197
 
143
198
  class RdfExtractor < Wukong::Streamer::LineStreamer
144
199
  include MungingUtils
200
+ attr_accessor :flavor, :kind, :filename, :regexps, :seen_keys, :seen_props
201
+
202
+ def initialize(*args)
203
+ @seen_keys = Hash.new(0)
204
+ @seen_props = Hash.new(0)
205
+ end
206
+
207
+ def record_for_flavor(kind, fields, flavor, hsh)
208
+ hsh.merge!( wp_ns: 0, flavor: flavor )
209
+ return if kind == :skip
210
+
211
+ case flavor
212
+ when :property_str, :property_foaf then hsh[:val] = MultiJson.encode(hsh[:val])
213
+ when :abstract_long, :abstract_short then hsh[:abstract] = MultiJson.encode(hsh[:abstract])
214
+ when :title then hsh[:title] = MultiJson.encode(hsh[:title]) ; hsh[:property] = 'title'
215
+ when :category_skos_title then hsh[:category_title] = MultiJson.encode(hsh[:category_title])
216
+ when :category_skos_type then hsh[:scheme] = 'skos'
217
+ when :category_subject then hsh[:scheme] = 'subject'
218
+ when :instance_type_a then hsh[:scheme] = 'dbpedia'
219
+ when :instance_type_b
220
+ hsh[:scheme] = SCHEMA_SCHEMES[hsh.delete(:org)]
221
+ return if hsh[:scheme] == 'owl'
222
+ when :wikipedia_link, :wikipedia_backlink
223
+ raise "Titles disagree!" unless hsh[:slug] == hsh[:wikipedia_id]
224
+ end
145
225
 
146
- def emit(hsh)
147
- super [MultiJson.encode(hsh)]
226
+ # record seen properties, seen fields
227
+ hsh.except(*fields).except(*SKIPPAPLE_FIELDS).
228
+ each{|key, val| @seen_keys[key] += 1 if val.present? }
229
+ seen_props[hsh[:property]] += 1 if hsh[:property].present?
230
+ sanity_check(hsh)
231
+ #
232
+ [kind] + hsh.values_at(*fields)
233
+ end
234
+
235
+ def sanity_check(hsh)
236
+ hsh.each{|key,val| raise if CONTROL_CHARS_RE =~ val.to_s }
237
+ end
238
+
239
+ def after_stream
240
+ Log.info ["seen keys:", seen_keys.inspect, "seen props:", seen_props.inspect].join("\t")
148
241
  end
149
242
 
150
243
  def process(line)
151
244
  return if line =~ /\A(?:\#|$)/
152
- if line =~ /=> \w+\.\w+ <=/ then yield [line] ; return ; end
153
- MAPPINGS.each do |flavor, re|
154
- next unless mm = re.match(line)
155
- yield( { flavor: flavor }.merge(mm.as_hash) )
245
+ if (line =~ /=> \w+\.\w+ <=/) then yield [line] ; return ; end
246
+
247
+ MAPPING_INFO.each do |flavor, info|
248
+ next unless mm = info[:re].match(line)
249
+ yield record_for_flavor(info[:kind], info[:fields], flavor, mm.as_hash)
156
250
  return
157
251
  end
158
- puts [line]
252
+
253
+ Log.warn ['not found:', line].join("\t")
159
254
  end
160
255
  end
161
256
  end
162
257
 
163
258
 
164
- # META = {
165
- # geo_coordinates: [:field, 'geo_coordinates_en.nq', ],
166
- # wordnet: [:joinkey, 'wordnet_links.nt', ],
167
- # geonames: [:joinkey, 'geonames_links.nt', ],
168
- # properties_specmap: [:properties, 'specific_mappingbased_properties_en.nq', ],
169
- # properties_mapped: [:properties, 'mappingbased_properties_unredirected_en.nq', ],
170
- # pnd: [:joinkey, 'pnd_en.nq', ],
171
- # disambiguations: [:pagelink, 'disambiguations_unredirected_en.nq', ],
172
- # external_links: [:weblink, 'external_links_en.nq', ],
173
- # page_ids: [:field, 'page_ids_en.nq', ],
174
- # redirects: [:pagelink, 'redirects_transitive_en.nt', ],
175
- # article_categories: [:categories, 'article_categories_en.nq', ],
176
- # instance_types: [:categories, 'instance_types_en.nq', ],
177
- # categories_skos: [:meta, 'skos_categories_en.nq', ],
178
- # abstracts_long: [:field, 'long_abstracts_en.nq', ],
179
- # abstracts_short: [:field, 'short_abstracts_en.nq', ],
180
- # category_labels: [:meta, 'category_labels_en.nq', ],
181
- # titles: [:field, 'labels_en.nq', ],
182
- # musicbrainz: [:joinkey, 'musicbrainz_links.nt', ],
183
- # nytimes: [:joinkey, 'nytimes_links.nt', ],
184
- # uscensus: [:joinkey, 'uscensus_links.nt', ],
185
- # topical_concepts: [ 'topical_concepts_unredirected_en.nq', ],
186
- # homepages: [:weblink, 'homepages_en.nq', ],
187
- # wikipedia_links: [:field, 'wikipedia_links_en.nq', ],
188
- # persondata: [:properties, 'persondata_unredirected_en.nq', ],
189
- # yago: [:joinkey, 'yago_links.nt', ],
190
- # }
191
-
192
259
 
193
260
  Wukong::Script.new(Dbpedia::RdfExtractor, nil).run