wukong 3.0.0.pre2 → 3.0.0.pre3
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +13 -0
- data/README.md +182 -6
- data/bin/wu-local +13 -5
- data/bin/wu-server +1 -1
- data/examples/Gemfile +2 -1
- data/examples/basic/string_reverser.rb +23 -0
- data/examples/{tiny_count.rb → basic/tiny_count.rb} +0 -0
- data/examples/{word_count → basic/word_count}/accumulator.rb +0 -0
- data/examples/{word_count → basic/word_count}/tokenizer.rb +0 -0
- data/examples/{word_count → basic/word_count}/word_count.rb +0 -0
- data/examples/deploy_pack/Gemfile +7 -0
- data/examples/deploy_pack/README.md +6 -0
- data/examples/{text/latinize_text.rb → deploy_pack/a/b/c/.gitkeep} +0 -0
- data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
- data/examples/deploy_pack/config/environment.rb +1 -0
- data/examples/{dataflow → dsl/dataflow}/fibonacci_series.rb +0 -0
- data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/{dataflow → dsl/dataflow}/simple.rb +0 -0
- data/examples/{dataflow → dsl/dataflow}/telegram.rb +0 -0
- data/examples/{workflow → dsl/workflow}/cherry_pie.dot +0 -0
- data/examples/{workflow → dsl/workflow}/cherry_pie.md +0 -0
- data/examples/{workflow → dsl/workflow}/cherry_pie.png +0 -0
- data/examples/{workflow → dsl/workflow}/cherry_pie.rb +0 -0
- data/examples/empty/.gitkeep +0 -0
- data/examples/graph/implied_geolocation/README.md +63 -0
- data/examples/graph/{minimum_spanning_tree.rb → minimum_spanning_tree/airfares_graphviz.rb} +0 -0
- data/examples/munging/airline_flights/indexable.rb +75 -0
- data/examples/munging/airline_flights/indexable_spec.rb +90 -0
- data/examples/munging/geo/geonames_models.rb +29 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +1 -0
- data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +213 -146
- data/examples/rake_helper.rb +12 -0
- data/examples/ruby_project/Gemfile +7 -0
- data/examples/ruby_project/README.md +6 -0
- data/examples/ruby_project/a/b/c/.gitkeep +0 -0
- data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/serverlogs/models/logline.rb +102 -0
- data/examples/{dataflow/parse_apache_logs.rb → serverlogs/parser/apache_parser_widget.rb} +0 -0
- data/examples/serverlogs/visit_paths/common.rb +4 -0
- data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
- data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
- data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
- data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
- data/examples/text/{pig_latin.rb → pig_latin/pig_latinizer.rb} +0 -0
- data/examples/{dataflow/pig_latinizer.rb → text/pig_latin/pig_latinizer_widget.rb} +0 -0
- data/lib/hanuman/graph.rb +6 -1
- data/lib/wu/geo.rb +4 -0
- data/lib/wu/geo/geo_grids.numbers +0 -0
- data/lib/wu/geo/geolocated.rb +331 -0
- data/lib/wu/geo/quadtile.rb +69 -0
- data/{examples → lib/wu}/graph/union_find.rb +0 -0
- data/lib/wu/model/reconcilable.rb +63 -0
- data/{examples/munging/wikipedia/utils/munging_utils.rb → lib/wu/munging.rb} +7 -4
- data/lib/wu/social/models/twitter.rb +31 -0
- data/{examples/models/wikipedia.rb → lib/wu/wikipedia/models.rb} +0 -0
- data/lib/wukong.rb +9 -4
- data/lib/wukong/boot.rb +10 -1
- data/lib/wukong/driver.rb +65 -71
- data/lib/wukong/logger.rb +93 -0
- data/lib/wukong/processor.rb +38 -29
- data/lib/wukong/runner.rb +144 -0
- data/lib/wukong/server.rb +119 -0
- data/lib/wukong/spec_helpers.rb +1 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +22 -9
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +26 -4
- data/lib/wukong/spec_helpers/processor_helpers.rb +4 -10
- data/lib/wukong/spec_helpers/shared_examples.rb +12 -13
- data/lib/wukong/version.rb +1 -1
- data/lib/wukong/widget/processors.rb +13 -0
- data/lib/wukong/widget/serializers.rb +55 -65
- data/lib/wukong/widgets.rb +0 -2
- data/spec/hanuman/graph_spec.rb +14 -0
- data/spec/spec_helper.rb +4 -30
- data/spec/support/{wukong_test_helpers.rb → example_test_helpers.rb} +29 -2
- data/spec/support/integration_helper.rb +38 -0
- data/spec/support/model_test_helpers.rb +115 -0
- data/spec/wu/geo/geolocated_spec.rb +247 -0
- data/spec/wu/model/reconcilable_spec.rb +152 -0
- data/spec/wukong/widget/processors_spec.rb +0 -1
- data/spec/wukong/widget/serializers_spec.rb +88 -62
- data/spec/wukong/wu_local_spec.rb +125 -0
- data/wukong.gemspec +3 -16
- metadata +72 -266
- data/examples/dataflow/apache_log_line.rb +0 -100
- data/examples/jabberwocky.txt +0 -36
- data/examples/munging/Gemfile +0 -8
- data/examples/munging/airline_flights/airline.rb +0 -57
- data/examples/munging/airline_flights/airport.rb +0 -211
- data/examples/munging/airline_flights/flight.rb +0 -156
- data/examples/munging/airline_flights/models.rb +0 -4
- data/examples/munging/airline_flights/parse.rb +0 -26
- data/examples/munging/airline_flights/route.rb +0 -35
- data/examples/munging/airline_flights/timezone_fixup.rb +0 -62
- data/examples/munging/airports/40_wbans.txt +0 -40
- data/examples/munging/airports/filter_weather_reports.rb +0 -37
- data/examples/munging/airports/join.pig +0 -31
- data/examples/munging/airports/to_tsv.rb +0 -33
- data/examples/munging/airports/usa_wbans.pig +0 -19
- data/examples/munging/airports/usa_wbans.txt +0 -2157
- data/examples/munging/airports/wbans.pig +0 -19
- data/examples/munging/airports/wbans.txt +0 -2310
- data/examples/munging/rake_helper.rb +0 -62
- data/examples/munging/weather/.gitignore +0 -1
- data/examples/munging/weather/Gemfile +0 -4
- data/examples/munging/weather/Rakefile +0 -28
- data/examples/munging/weather/extract_ish.rb +0 -13
- data/examples/munging/weather/models/weather.rb +0 -119
- data/examples/munging/weather/utils/noaa_downloader.rb +0 -46
- data/examples/munging/wikipedia/README.md +0 -34
- data/examples/munging/wikipedia/Rakefile +0 -193
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +0 -18
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +0 -21
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +0 -27
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +0 -29
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +0 -14
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +0 -25
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +0 -29
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +0 -32
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +0 -85
- data/examples/munging/wikipedia/pig_style_guide.md +0 -25
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +0 -19
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +0 -23
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +0 -24
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +0 -22
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +0 -22
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +0 -26
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +0 -29
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +0 -24
- data/examples/munging/wikipedia/utils/get_namespaces.rb +0 -86
- data/examples/munging/wikipedia/utils/namespaces.json +0 -1
- data/examples/string_reverser.rb +0 -26
- data/examples/twitter/locations.rb +0 -29
- data/examples/twitter/models.rb +0 -24
- data/examples/twitter/pt1-fiddle.pig +0 -8
- data/examples/twitter/pt2-simple_parse.pig +0 -31
- data/examples/twitter/pt2-simple_parse.rb +0 -18
- data/examples/twitter/pt3-join_on_zips.pig +0 -39
- data/examples/twitter/pt4-strong_links.rb +0 -20
- data/examples/twitter/pt5-lnglat_and_strong_links.pig +0 -16
- data/examples/twitter/states.tsv +0 -50
- data/examples/workflow/package_gem.rb +0 -55
- data/lib/wukong/widget/sink.rb +0 -16
- data/lib/wukong/widget/source.rb +0 -14
File without changes
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Gorillib
|
2
|
+
module Model
|
3
|
+
|
4
|
+
#
|
5
|
+
# @example
|
6
|
+
# class Airport
|
7
|
+
# include Gorillib::Model::Indexable
|
8
|
+
# # ... define model
|
9
|
+
# index_on :icao_code, :city
|
10
|
+
# end
|
11
|
+
# Airport.for_icao_code('KAUS') #=> #<Airport icao_code="KAUS" ... >
|
12
|
+
# Airport.for_city('Austin') #=> #<Airport icao_code="KAUS" ... >
|
13
|
+
#
|
14
|
+
# You must implement a `load` method
|
15
|
+
#
|
16
|
+
module Indexable
|
17
|
+
extend Gorillib::Concern
|
18
|
+
|
19
|
+
included do |base|
|
20
|
+
base.class_attribute :lookups, instance_writer: false
|
21
|
+
self.lookups ||= []
|
22
|
+
end
|
23
|
+
|
24
|
+
module ClassMethods
|
25
|
+
|
26
|
+
def values
|
27
|
+
@values ||= Array.new
|
28
|
+
end
|
29
|
+
|
30
|
+
def flush_lookups
|
31
|
+
lookups.each{|idx| remove_instance_variable("@#{idx}") if instance_variable_defined?("@#{idx}") }
|
32
|
+
end
|
33
|
+
|
34
|
+
#
|
35
|
+
# @example
|
36
|
+
# class Airport
|
37
|
+
# index_on :icao_code, [:city, :cities]
|
38
|
+
# end
|
39
|
+
# Airport.for_icao_code('KAUS') #=> #<Airport icao_code="KAUS" ... >
|
40
|
+
# Airport.for_city('Austin') #=> #<Airport icao_code="KAUS" ... >
|
41
|
+
#
|
42
|
+
# NOTE: `.#{key_name}_index` method is NOT part of the framework interface.
|
43
|
+
# only the `.for_#{key_name}` method is supported.
|
44
|
+
def index_on(*key_names)
|
45
|
+
self.lookups += key_names
|
46
|
+
self.lookups.uniq!
|
47
|
+
#
|
48
|
+
key_names.each do |key_name, index_name|
|
49
|
+
index_name ||= "#{key_name}_index"
|
50
|
+
class_eval <<-EOV, __FILE__, __LINE__ + 1
|
51
|
+
class << self
|
52
|
+
def #{index_name} # def name_index
|
53
|
+
@#{index_name} ||= # @name_index ||=
|
54
|
+
Hash[values.map{|el| [el.#{key_name}, el] }] # Hash[values.map{|el| [el.name, el]
|
55
|
+
end # end
|
56
|
+
protected(:#{index_name}) # protected :name_index
|
57
|
+
end
|
58
|
+
EOV
|
59
|
+
|
60
|
+
instance_eval <<-EOV, __FILE__, __LINE__ + 1
|
61
|
+
def for_#{key_name}(*args, &block) # def for_name(*args, &block)
|
62
|
+
#{index_name}.fetch(*args, &block) # name_index.fetch(*args, &block)
|
63
|
+
end # end
|
64
|
+
EOV
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
|
69
|
+
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'gorillib/model'
|
3
|
+
require 'gorillib/pathname'
|
4
|
+
#
|
5
|
+
require 'gorillib/model/serialization'
|
6
|
+
require 'gorillib/model/serialization/tsv'
|
7
|
+
require 'gorillib/array/hashify'
|
8
|
+
#
|
9
|
+
require 'wu/model/indexable'
|
10
|
+
|
11
|
+
describe Gorillib::Model::Indexable, :model_spec, :only do
|
12
|
+
let(:mock_array){ mock('array') }
|
13
|
+
|
14
|
+
let(:country_code_class) do
|
15
|
+
module Gorillib::Test
|
16
|
+
remove_const(:CountryCode) if defined?(CountryCode)
|
17
|
+
|
18
|
+
class CountryCode
|
19
|
+
include Gorillib::Model
|
20
|
+
include Gorillib::Model::Indexable
|
21
|
+
field :alpha_2_code, String, position: 0
|
22
|
+
field :name, String, position: 1
|
23
|
+
def self.load
|
24
|
+
self.values << new('dj', 'Djibouti')
|
25
|
+
self.values << new('us', 'United States of America')
|
26
|
+
values
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
Gorillib::Test::CountryCode
|
32
|
+
end
|
33
|
+
|
34
|
+
let(:djibouti){ country_code_class.new('dj', 'Djibouti') }
|
35
|
+
let(:usa ){ country_code_class.new('us', 'United States of America') }
|
36
|
+
|
37
|
+
context 'test setup' do
|
38
|
+
subject{ country_code_class.load }
|
39
|
+
it{ should == [djibouti, usa] }
|
40
|
+
end
|
41
|
+
|
42
|
+
context '.values' do
|
43
|
+
# before{ country_code_class.send(:remove_instance_variable, '@values') }
|
44
|
+
it 'gets its values from .load' do
|
45
|
+
country_code_class.should_receive(:load).once.and_return mock_array
|
46
|
+
country_code_class.values.should equal(mock_array)
|
47
|
+
end
|
48
|
+
it 'memoizes once it is called' do
|
49
|
+
country_code_class.should_receive(:load).once.and_return mock_array
|
50
|
+
country_code_class.values.should equal(mock_array)
|
51
|
+
country_code_class.values.should equal(mock_array)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
context '.index_on' do
|
56
|
+
it 'defines a .for_foo method' do
|
57
|
+
country_code_class.should_not respond_to(:for_name)
|
58
|
+
country_code_class.index_on(:name)
|
59
|
+
country_code_class.should respond_to(:for_name)
|
60
|
+
country_code_class.protected_methods.should include(:name_index)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
context '.for_foo' do
|
65
|
+
before{ country_code_class.index_on :name }
|
66
|
+
context 'behaves like Hash#fetch:' do
|
67
|
+
context 'when key is not present' do
|
68
|
+
it 'retrieves a value if in the index' do
|
69
|
+
country_code_class.for_name('Djibouti').should == djibouti
|
70
|
+
end
|
71
|
+
end
|
72
|
+
context 'when key is not present' do
|
73
|
+
it 'and no default it raises KeyError' do
|
74
|
+
expect{ country_code_class.for_name('Yo Mama') }.to raise_error(KeyError, 'key not found: "Yo Mama"')
|
75
|
+
end
|
76
|
+
it 'returns default value if given' do
|
77
|
+
yo_mama = country_code_class.for_name('Yo Mama', 'wears combat boots')
|
78
|
+
yo_mama.should == 'wears combat boots'
|
79
|
+
end
|
80
|
+
it 'calls block if given' do
|
81
|
+
she = nil
|
82
|
+
so_fat = country_code_class.for_name('Yo Mama'){ she = 'sits around the house' ; 'when she sits' }
|
83
|
+
so_fat.should == 'when she sits'
|
84
|
+
she.should == 'sits around the house'
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
@@ -75,4 +75,33 @@ module Geo
|
|
75
75
|
class GeonamesCountry < GeonamesPlace
|
76
76
|
self.place_klass = Geo::Country
|
77
77
|
end
|
78
|
+
|
79
|
+
# http://download.geonames.org/export/zip/
|
80
|
+
#
|
81
|
+
# country code : iso country code, 2 characters
|
82
|
+
# postal code : varchar(20)
|
83
|
+
# place name : varchar(180)
|
84
|
+
# admin name1 : 1. order subdivision (state) varchar(100)
|
85
|
+
# admin code1 : 1. order subdivision (state) varchar(20)
|
86
|
+
# admin name2 : 2. order subdivision (county/province) varchar(100)
|
87
|
+
# admin code2 : 2. order subdivision (county/province) varchar(20)
|
88
|
+
# admin name3 : 3. order subdivision (community) varchar(100)
|
89
|
+
# admin code3 : 3. order subdivision (community) varchar(20)
|
90
|
+
# latitude : estimated latitude (wgs84)
|
91
|
+
# longitude : estimated longitude (wgs84)
|
92
|
+
# accuracy : accuracy of lat/lng from 1=estimated to 6=centroid
|
93
|
+
class GeonamesPostal
|
94
|
+
field :country_id, String, doc: "iso country code, 2 characters"
|
95
|
+
field :postal_id, String, doc: "varchar(20)"
|
96
|
+
field :name, String, doc: "varchar(180)"
|
97
|
+
field :admin1_name, String, doc: "1. order subdivision (state) varchar(100)"
|
98
|
+
field :admin1_id, String, doc: "1. order subdivision (state) varchar(20)"
|
99
|
+
field :admin2_name, String, doc: "2. order subdivision (county/province) varchar(100)"
|
100
|
+
field :admin2_id, String, doc: "2. order subdivision (county/province) varchar(20)"
|
101
|
+
field :admin3_name, String, doc: "3. order subdivision (community) varchar(100)"
|
102
|
+
field :admin3_id, String, doc: "3. order subdivision (community) varchar(20)"
|
103
|
+
field :latitude, String, doc: "estimated latitude (wgs84)"
|
104
|
+
field :longitude, String, doc: "estimated longitude (wgs84)"
|
105
|
+
field :accuracy, String, doc: "accuracy of lat/lng from 1=estimated to 6=centroid"
|
106
|
+
end
|
78
107
|
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
|
2
|
+
Settings.define :dbpedia_filetype, description: 'The dbpedia file type ("geo_coordinates", etc) -- taken from input filename if available'
|
3
|
+
|
4
|
+
# Settings[:dbpedia_filetype] ||= Settings[:input_paths].to_s
|
5
|
+
# Settings[:dbpedia_filetype] = File.basename(Settings[:dbpedia_filetype]).gsub(/[\.\-].*/, '')
|
6
|
+
# @flavor, flavor_info = DBPEDIA_FLAVOR_INFO.detect{|flavor, (filename, _r)| filename == Settings[:dbpedia_filetype] }
|
7
|
+
# @kind, @filename, @regexps = flavor_info
|
8
|
+
|
9
|
+
DBPEDIA_FLAVOR_INFO = {
|
10
|
+
title: ['labels_en', [:title, ], ],
|
11
|
+
page_id: ['page_ids_en', [:page_id, ], ],
|
12
|
+
wikipedia_link: ['wikipedia_links_en', [:wikipedia_links, :wikipedia_backlink, :wikipedia_lang, ], ],
|
13
|
+
abstract_short: ['short_abstracts_en', [:abstract_short, ], ],
|
14
|
+
abstract_long: ['long_abstracts_en', [:abstract_long, ], ],
|
15
|
+
geo_coordinates: ['geo_coordinates_en', [:geo_coordinates, :geo_coord_skip_a, :geo_coord_skip_b, ], ],
|
16
|
+
# #
|
17
|
+
page_links: ['page_links_unredirected_en', [:page_links, ], ],
|
18
|
+
disambiguations: ['disambiguations_unredirected_en', [:disambiguations, ], ],
|
19
|
+
redirects: ['redirects_transitive_en', [:redirects, ], ],
|
20
|
+
# #
|
21
|
+
external_links: ['external_links_en', [:external_links, ], ],
|
22
|
+
homepages: ['homepages_en', [:homepages, ], ],
|
23
|
+
geonames: ['geonames_links', [:geonames, ], ],
|
24
|
+
musicbrainz: ['musicbrainz_links', [:musicbrainz, ], ],
|
25
|
+
nytimes: ['nytimes_links', [:nytimes, ], ],
|
26
|
+
uscensus: ['uscensus_links', [:uscensus, ], ],
|
27
|
+
pnd: ['pnd_en', [:pnd, ], ],
|
28
|
+
# #
|
29
|
+
article_categories: ['article_categories_en', [:article_categories, ], ],
|
30
|
+
category_title: ['category_labels_en', [:title, ], ],
|
31
|
+
category_skos: ['skos_categories_en', [:category_skos_skip, :category_skos_title, :category_skos_reln ], ],
|
32
|
+
# #
|
33
|
+
wordnet: ['wordnet_links', [:wordnet, ], ],
|
34
|
+
persondata: ['persondata_unredirected_en', [:persondata_reln, :persondata_type, ], ],
|
35
|
+
yago: ['yago_links', [:yago, :instance_type_a, :instance_type_b, ], ],
|
36
|
+
instance_types: ['instance_types_en', [:yago, :instance_type_a, :instance_type_b, ], ],
|
37
|
+
property_specmap: ['specific_mappingbased_properties_en', [:property_specmap, ], ],
|
38
|
+
property_mapped: ['mappingbased_properties_unredirected_en', [
|
39
|
+
:property_str, :property_bool, :property_int,
|
40
|
+
:property_float, :property_date, :property_yearmonth, :property_monthday,
|
41
|
+
:persondata_reln, :persondata_type, :property_foaf, :property_desc, ], ],
|
42
|
+
topical_concepts: ['topical_concepts_unredirected_en', [:topical_concepts, ], ],
|
43
|
+
}
|
44
|
+
|
45
|
+
module Re
|
46
|
+
##
|
47
|
+
# Container for the character classes specified in
|
48
|
+
# <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
|
49
|
+
# Borrowed from the addressable gem
|
50
|
+
module Uri
|
51
|
+
ALPHA = "a-zA-Z"
|
52
|
+
DIGIT = "0-9"
|
53
|
+
GEN_DELIMS = "\\:\\/\\?\\#\\[\\]\\@"
|
54
|
+
SUB_DELIMS = "\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\="
|
55
|
+
RESERVED = GEN_DELIMS + SUB_DELIMS
|
56
|
+
UNRESERVED = ALPHA + DIGIT + "\\-\\.\\_\\~"
|
57
|
+
PCHAR = UNRESERVED + SUB_DELIMS + "\\:\\@"
|
58
|
+
SCHEME = ALPHA + DIGIT + "\\-\\+\\."
|
59
|
+
AUTHORITY = PCHAR
|
60
|
+
PATH = PCHAR + "\\/"
|
61
|
+
QUERY = PCHAR + "\\/\\?"
|
62
|
+
FRAGMENT = PCHAR + "\\/\\?"
|
63
|
+
#
|
64
|
+
PATHSEG = ""
|
65
|
+
end
|
66
|
+
end
|
@@ -2,192 +2,259 @@
|
|
2
2
|
require_relative './dbpedia_common'
|
3
3
|
require 'ap'
|
4
4
|
|
5
|
-
|
6
|
-
##
|
7
|
-
# Container for the character classes specified in
|
8
|
-
# <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
|
9
|
-
# Borrowed from the addressable gem
|
10
|
-
module Uri
|
11
|
-
ALPHA = "a-zA-Z"
|
12
|
-
DIGIT = "0-9"
|
13
|
-
GEN_DELIMS = "\\:\\/\\?\\#\\[\\]\\@"
|
14
|
-
SUB_DELIMS = "\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\="
|
15
|
-
RESERVED = GEN_DELIMS + SUB_DELIMS
|
16
|
-
UNRESERVED = ALPHA + DIGIT + "\\-\\.\\_\\~"
|
17
|
-
PCHAR = UNRESERVED + SUB_DELIMS + "\\:\\@"
|
18
|
-
SCHEME = ALPHA + DIGIT + "\\-\\+\\."
|
19
|
-
AUTHORITY = PCHAR
|
20
|
-
PATH = PCHAR + "\\/"
|
21
|
-
QUERY = PCHAR + "\\/\\?"
|
22
|
-
FRAGMENT = PCHAR + "\\/\\?"
|
23
|
-
#
|
24
|
-
PATHSEG = ""
|
25
|
-
end
|
26
|
-
end
|
27
|
-
#
|
28
|
-
#
|
5
|
+
# Notes:
|
29
6
|
#
|
7
|
+
# * disambiguation: `generic disambiguates specifics` -- `["Alien", "Alien_(law)"]` and `["Alien", "Alien_(film)"]`
|
8
|
+
# * redirects: `dupe redirects to actual` -- `["Oxygen-13", "Isotopes_of_oxygen"]`
|
9
|
+
# * page_link: `from links to into` -- `["Achilles", "Greeks"]`
|
10
|
+
|
30
11
|
module Dbpedia
|
31
12
|
|
32
|
-
DBLQ_STRING_C = '"(?<%s>\\\"|[^\"]+)+"'
|
33
13
|
DECIMAL_NUM_RE = '[\-\+\d]+\.\d+'
|
14
|
+
URI_PATHCHARS = '\w\-\.\'~!$&()*+,;=:@'
|
15
|
+
# any backslash-escaped character, or any non-quote character, up to the first quote
|
16
|
+
DBLQ_STRING_C = '"(?<%s>(?:\\.|[^\"])*)"'
|
17
|
+
|
18
|
+
# output flavors:
|
19
|
+
#
|
20
|
+
# :abstract_long :abstract_short :category :category_reln :disambiguation
|
21
|
+
# :external_link :geo_coordinates :homepage :instance_of :page_id :page_link
|
22
|
+
# :persondata_reln :property :redirects :sameas :subject :title :wikipedia_link
|
23
|
+
#
|
24
|
+
|
25
|
+
MAPPING_INFO = {
|
26
|
+
# atomic topic properties
|
27
|
+
title: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :title, ], },
|
28
|
+
page_id: { kind: :page_id, fields: [:page_id, :wp_ns, :wikipedia_id, :wikipedia_pageid, ], },
|
29
|
+
abstract_short: { kind: :abstract_short, fields: [:page_id, :wp_ns, :wikipedia_id, :abstract, ], },
|
30
|
+
abstract_long: { kind: :abstract_long, fields: [:page_id, :wp_ns, :wikipedia_id, :abstract, ], },
|
31
|
+
wikipedia_lang: { kind: :skip, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :url, :slug, :lang, ], },
|
32
|
+
wikipedia_link: { kind: :wikipedia_link, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :url, :slug, :revision_id, ], },
|
33
|
+
wikipedia_backlink: { kind: :skip, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :url, :slug, :revision_id, ], },
|
34
|
+
geo_coordinates: { kind: :geo_coordinates, fields: [:page_id, :wp_ns, :wikipedia_id, :lat, :lng, ], },
|
35
|
+
geo_coord_skip_a: { kind: :skip, fields: [], },
|
36
|
+
geo_coord_skip_b: { kind: :skip, fields: [], },
|
37
|
+
# links between topics
|
38
|
+
page_link: { kind: :page_link, fields: [:page_id, :wp_ns, :from_id, :relation, :into_id, ], },
|
39
|
+
disambiguation: { kind: :disambiguation, fields: [:page_id, :wp_ns, :generic_wpid, :relation, :specific_wpid, ], },
|
40
|
+
redirects: { kind: :redirects, fields: [:page_id, :wp_ns, :dupe_id, :relation, :wikipedia_id, ], },
|
41
|
+
# external links and sameas'es
|
42
|
+
external_link: { kind: :external_link, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :weblink_url, ], },
|
43
|
+
homepage: { kind: :homepage, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :weblink_url, ], },
|
44
|
+
geonames: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :geonames_id, ], },
|
45
|
+
musicbrainz: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :musicbrainz_type, :musicbrainz_id,], },
|
46
|
+
nytimes: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :nytimes_id, ], },
|
47
|
+
pnd: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :pnd_id, ], },
|
48
|
+
uscensus: { kind: :sameas, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :country_id, :state_id, :kind, :adm2_id, :adm3_id, :adm4_id], },
|
49
|
+
# category links
|
50
|
+
category_skos_type: { kind: :instance_of, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class ], },
|
51
|
+
category_skos_title: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :val_type, :category_title, ], },
|
52
|
+
category: { kind: :category, fields: [:page_id, :wp_ns, :wikipedia_id, :flavor, :specific_wpid, ], },
|
53
|
+
category_subject: { kind: :subject, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :into_wpid, ], },
|
54
|
+
category_reln: { kind: :category_reln, fields: [:page_id, :wp_ns, :wikipedia_id, :relation, :into_wpid, ], },
|
55
|
+
# properties
|
56
|
+
wordnet: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :wn_reln, :wn_class, :wn_pos, :wn_idx, ], },
|
57
|
+
property_bool: { kind: :property_bool, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
|
58
|
+
property_int: { kind: :property_int, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
|
59
|
+
property_float: { kind: :property_float, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
|
60
|
+
property_date: { kind: :property_date, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
|
61
|
+
property_yearmonth: { kind: :property_yearmonth, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
|
62
|
+
property_monthday: { kind: :property_monthday, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
|
63
|
+
property_str: { kind: :property_str, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
|
64
|
+
#
|
65
|
+
persondata_reln: { kind: :persondata_reln, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :into_wpid, ], },
|
66
|
+
# persondata_type: { kind: :# persondata_type, fields: [:page_id, :wp_ns, :wikipedia_id, :property, ], },
|
67
|
+
property_foaf: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :val, ], },
|
68
|
+
property_desc: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :val_type, :name, ], },
|
69
|
+
yago: { kind: :category, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class, ], },
|
70
|
+
instance_type_a: { kind: :instance_of, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class, ], },
|
71
|
+
instance_type_b: { kind: :instance_of, fields: [:page_id, :wp_ns, :wikipedia_id, :scheme, :obj_class, ], },
|
72
|
+
property_specmap: { kind: :property, fields: [:page_id, :wp_ns, :wikipedia_id, :property, :units, :val, ], },
|
73
|
+
# topical_concepts: { kind: :# topical_concepts, fields: [:page_id, :wp_ns, :wikipedia_id, :skos_subject :x, ], },
|
74
|
+
}
|
34
75
|
|
35
76
|
RDF_RES = {
|
77
|
+
# type descriptions
|
78
|
+
dbpedia_class: 'http://dbpedia\.org/class/(?<%s>[^>\s]+)',
|
36
79
|
dbpedia_ontb: 'http://dbpedia\.org/ontology',
|
37
80
|
dbpedia_ont: 'http://dbpedia\.org/ontology/(?<%s>[\w\/]+)',
|
38
81
|
dbpedia_prop: 'http://dbpedia\.org/property/(?<%s>\w+)',
|
39
|
-
|
40
|
-
|
41
|
-
wikipedia_rsrc:
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
foaf_topic: 'http://xmlns\.com/foaf/0\.1/(?:isPrimaryTopicOf|primaryTopic)',
|
46
|
-
foaf_prop: 'http://xmlns\.com/foaf/0\.1/(?<property>\w+)',
|
47
|
-
geonames_rsrc: 'http://sws\.geonames\.org/(?<%s>\d+)/',
|
48
|
-
georss_latlng: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\\s(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"@en',
|
49
|
-
georss_type: 'http://www\.georss\.org/georss/point',
|
50
|
-
musicbrainz_rsrc: 'http://zitgist\.com/music/(?<%s>\w+)/(?<%s>[a-f0-9\-]+)',
|
51
|
-
nytimes_rsrc: 'http://data\.nytimes\.com/(?<%s>[A-Z0-9]+)',
|
52
|
-
purl_subject: 'http://purl\.org/dc/terms/subject',
|
82
|
+
dbpedia_rsrc: 'http://dbpedia\.org/resource/(?<%s>[' + URI_PATHCHARS + '%%\/]+)',
|
83
|
+
yago_class: 'http://dbpedia\.org/class/(?<%s>yago)/(?<%s>[' + URI_PATHCHARS + '%%\/]+)',
|
84
|
+
wikipedia_rsrc: '(?<%s>http://\w\w\.wikipedia\.org/wiki/(?<%s>[' + URI_PATHCHARS + '%%\/]+))',
|
85
|
+
wiki_category: 'http://en\.wikipedia\.org/wiki/Category:Futurama?oldid=485425712\\#absolute-line=1',
|
86
|
+
wiki_link_id: 'http://en\.wikipedia\.org/wiki/(?<%s>[^\?]+)\?oldid=(?<%s>\d+)(?:\\#absolute-line=(?<%s>\d+))?',
|
87
|
+
wiki_link_id_sec: 'http://en\.wikipedia\.org/wiki/(?<%s>[^\?]+)\?oldid=(?<%s>\d+)\\#?(?:section=(?<%s>.*?)\&relative-line=(?<%s>\d+))?(?:&?absolute-line=(?<%s>\d+))?',
|
53
88
|
purl_desc: 'http://purl\.org/dc/elements/1\.1/(?<%s>description)',
|
54
89
|
purl_lang: 'http://purl\.org/dc/elements/1\.1/language',
|
90
|
+
purl_subject: 'http://purl\.org/dc/terms/subject',
|
91
|
+
rdf_type: 'http://www\.w3\.org/1999/02/22-rdf-syntax-ns\\#type',
|
55
92
|
rdf_comment: 'http://www\.w3\.org/2000/01/rdf-schema\\#comment',
|
56
|
-
rdf_eol: '\\.',
|
57
|
-
rdf_float: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"\^\^<http://www\.w3\.org/2001/XMLSchema\#float>',
|
58
|
-
rdf_integer: '\"(?<%s>\d+)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#integer>',
|
59
|
-
rdf_date: '\"(?<%s>\d\d\d\d-\d\d-\d\d)\"\^\^<http://www\.w3\.org/2001/XMLSchema\\#date>',
|
60
|
-
rdf_string: '\"(?<%s>(?:\\\"|[^\"]+)*)"@(?<%s>\w+)\b',
|
61
93
|
rdf_label: 'http://www\.w3\.org/2000/01/rdf-schema\\#label',
|
62
|
-
|
94
|
+
# external links and sameas'es
|
63
95
|
same_as: 'http://www\.w3\.org/2002/07/owl\\#sameAs',
|
64
|
-
|
65
|
-
|
96
|
+
wordnet_inst: 'http://www\.w3\.org/2006/03/wn/wn20/instances/(?<%s>synset)-(?<%s>\w+)-(?<%s>noun)-(?<%s>[0-9]+)',
|
97
|
+
musicbrainz_rsrc: 'http://zitgist\.com/music/(?<%s>\w+)/(?<%s>[a-f0-9\-]+)',
|
98
|
+
nytimes_rsrc: 'http://data\.nytimes\.com/(?<%s>[A-Z0-9]+)',
|
99
|
+
geonames_rsrc: 'http://sws\.geonames\.org/(?<%s>\d+)/',
|
100
|
+
georss_type: 'http://www\.georss\.org/georss/point',
|
66
101
|
wgs_latorlng: 'http://www\.w3\.org/2003/01/geo/wgs84_pos\\#(?:lat|long)',
|
102
|
+
# http://www.rdfabout.com/rdf/usgov/geo/ us / ak / counties /bethel_area /an_subarea /aniak >
|
103
|
+
uscensus_url: 'http://www.rdfabout.com/rdf/usgov/geo/(?<%s>us)/(?<%s>\w\w)(?:/(?<%s>counties)/(?<%s>\w+)(?:/(?<%s>\w+)\/?(?<%s>\w+)?)?)?',
|
104
|
+
# category links
|
105
|
+
skos_subject: 'http://www\.w3\.org/2004/02/skos/core\\#subject',
|
106
|
+
skos_concept: 'http://www\.w3\.org/2004/02/skos/core\\#(?<%s>[a-zA-Z]+)',
|
107
|
+
foaf_homepage: 'http://xmlns\.com/foaf/0\.1/homepage',
|
108
|
+
foaf_name: 'http://xmlns\.com/foaf/0\.1/name',
|
109
|
+
foaf_topic: 'http://xmlns\.com/foaf/0\.1/(?:isPrimaryTopicOf|primaryTopic)',
|
110
|
+
foaf_prop: 'http://xmlns\.com/foaf/0\.1/(?<property>\w+)',
|
111
|
+
# property values
|
112
|
+
georss_latlng: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\\s(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"@\w\w',
|
113
|
+
rdf_eol: '\\.',
|
114
|
+
#
|
115
|
+
rdf_bool: '\"(?<%s>true|false )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>boolean)>',
|
116
|
+
rdf_date: '\"(?<%s>-?\d\d\d\d-\d\d-\d\d )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>date)>',
|
117
|
+
rdf_yearmonth: '\"(?<%s>-?\d\d\d\d-\d\d )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>gYearMonth)>',
|
118
|
+
rdf_monthday: '\"(?<%s>--\d\d-\d\d )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>gMonthDay)>',
|
119
|
+
rdf_int: '\"(?<%s>[\+\-]?\d+ )\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>integer|gYear|positiveInteger|nonNegativeInteger)>',
|
120
|
+
rdf_float: '\"(?<%s>[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>float|double)>',
|
121
|
+
# any backslash-escaped character, or any non-quote character, up to the first quote
|
122
|
+
rdf_string: '"(?<%s>(?:\\\\.|[^\"])*)"@en',
|
123
|
+
dbpedia_value: '"(?<%s>(?:\\\\.|[^\"])*)"\\^\\^<http://dbpedia\.org/datatype/(?<%s>[a-zA-Z]+)>',
|
124
|
+
#
|
67
125
|
url_loose: '(?<%s>(?:https?|ftp)://(?:[a-zA-Z0-9\-]+\.)+(?:[a-zA-Z\-]+)[^\s>]*)',
|
68
|
-
|
69
|
-
|
70
|
-
wiki_link_id_sec: 'http://en\.wikipedia\.org/wiki/(?<%s>[^\?]+)\?oldid=(?<%s>\d+)\\#?(?:section=(?<%s>.*?)\&relative-line=(?<%s>\d+))?(?:&?absolute-line=(?<%s>\d+))?',
|
71
|
-
wordnet_inst: 'http://www\.w3\.org/2006/03/wn/wn20/instances/synset-(?<%s>\w+)-noun-(?<%s>[0-9]+)',
|
72
|
-
yago_class: 'http://dbpedia\.org/class/yago',
|
73
|
-
rdf_value: '\"(?<%s>
|
74
|
-
-?\d\d\d\d-\d\d-\d\d|-?\d\d\d\d-\d\d|--\d\d-\d\d|
|
75
|
-
[\+\-]?\d+ |
|
76
|
-
[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?|
|
77
|
-
true|false)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>integer|date|gYearMonth|gMonthDay|gYear|positiveInteger|nonNegativeInteger|float|double|boolean)>',
|
78
|
-
|
79
|
-
schema_type: 'http://(?<%s>www\\.w3\\.org/2002/07/owl|schema\\.org|dbpedia\\.org/ontology|purl\\.org/ontology|xmlns.com/foaf/0\\.1)[/\#]([^>]+)'
|
126
|
+
# rdf_value: '\"(?<%s>-?\d\d\d\d-\d\d-\d\d|-?\d\d\d\d-\d\d|--\d\d-\d\d|[\+\-]?\d+|[\+\-]?\d+\.\d+(?:[eE][\+\-]?\d+)?|true|false)\"\\^\\^<http://www\.w3\.org/2001/XMLSchema\\#(?<%s>integer|date|gYearMonth|gMonthDay|gYear|positiveInteger|nonNegativeInteger|float|double|boolean)>',
|
127
|
+
schema_type: 'http://(?<%s>www\\.w3\\.org/2002/07/owl|schema\\.org|dbpedia\\.org/ontology|purl\\.org/ontology|xmlns.com/foaf/0\\.1)[/\\#](?<%s>[^>]+)'
|
80
128
|
}
|
81
129
|
|
82
|
-
|
83
|
-
|
130
|
+
SCHEMA_SCHEMES = {
|
131
|
+
'www.w3.org/2002/07/owl' => 'owl',
|
132
|
+
'schema.org' => 'schemaorg',
|
133
|
+
'dbpedia.org/ontology' => 'dbpedia',
|
134
|
+
'purl.org/ontology' => 'purl',
|
135
|
+
'xmlns.com/foaf/0.1' => 'foaf'
|
136
|
+
}
|
84
137
|
|
138
|
+
# lookup regexp in above table, sub in variable names
|
85
139
|
private
|
86
140
|
def self.r(regexp_name, *args)
|
87
141
|
RDF_RES[regexp_name] % args
|
88
142
|
end
|
89
143
|
public
|
90
144
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
145
|
+
MAPPING_RES = {
|
146
|
+
# atomic topic properties
|
147
|
+
title: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_label)}> \s#{r(:rdf_string, :title )} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
148
|
+
page_id: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageID> \s#{r(:rdf_int, :wikipedia_pageid, :_dtyp)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
149
|
+
wikipedia_lang: %r{\A<#{r(:wikipedia_rsrc, :url, :slug)}> \s<#{r(:purl_lang)}> \s#{r(:rdf_string, :lang)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
150
|
+
wikipedia_link: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_topic)}> \s<#{r(:wikipedia_rsrc, :url, :slug)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
151
|
+
wikipedia_backlink: %r{\A<#{r(:wikipedia_rsrc, :url, :slug)}> \s<#{r(:foaf_topic)}> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
152
|
+
abstract_short: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_comment)}> \s#{r(:rdf_string, :abstract)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
153
|
+
abstract_long: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/abstract> \s#{r(:rdf_string, :abstract)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}xm,
|
154
|
+
geo_coordinates: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:georss_type)}> \s#{r(:georss_latlng, :lat, :lng)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
155
|
+
geo_coord_skip_a: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<http://www\.opengis\.net/gml/_Feature> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
156
|
+
geo_coord_skip_b: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:wgs_latorlng)}> \s#{r(:rdf_float, :val, :_dtyp)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
157
|
+
# links between topics
|
158
|
+
page_link: %r{\A<#{r(:dbpedia_rsrc, :from_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageWikiLink> \s<#{r(:dbpedia_rsrc, :into_id)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
159
|
+
disambiguation: %r{\A<#{r(:dbpedia_rsrc, :generic_wpid)}> \s<#{r(:dbpedia_ontb)}/wikiPageDisambiguates> \s<#{r(:dbpedia_rsrc, :specific_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
160
|
+
redirects: %r{\A<#{r(:dbpedia_rsrc, :dupe_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageRedirects> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s#{r(:rdf_eol)} \z}x,
|
161
|
+
# external links and same-as links
|
162
|
+
external_link: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/wikiPageExternalLink> \s<#{r(:url_loose, :weblink_url)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
163
|
+
homepage: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_homepage)}> \s<#{r(:url_loose, :weblink_url)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
164
|
+
geonames: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:geonames_rsrc, :geonames_id)}> \s#{r(:rdf_eol)} \z}x,
|
165
|
+
musicbrainz: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:musicbrainz_rsrc, :musicbrainz_type, :musicbrainz_id)}> \s#{r(:rdf_eol)} \z}x,
|
166
|
+
nytimes: %r{\A<#{r(:nytimes_rsrc, :nytimes_id)}> \s<#{r(:same_as)}> \s<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s#{r(:rdf_eol)} \z}x,
|
167
|
+
uscensus: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:same_as)}> \s<#{r(:uscensus_url, :country_id, :state_id, :kind, :adm2_id, :adm3_id, :adm4_id)}> \s#{r(:rdf_eol)} \z}x,
|
168
|
+
pnd: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ontb)}/individualisedPnd> \s#{r(:rdf_string, :pnd_id)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
169
|
+
# category links
|
170
|
+
category: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:purl_subject)}> \s<#{r(:dbpedia_rsrc, :specific_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
171
|
+
category_skos_type: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:skos_concept, :obj_class)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
172
|
+
category_subject: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_subject, :relation)}> \s<#{r(:dbpedia_rsrc, :into_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
173
|
+
category_reln: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_concept, :relation)}> \s<#{r(:dbpedia_rsrc, :into_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
174
|
+
category_skos_title: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_concept, :relation)}> \s#{r(:rdf_string, :category_title)} \s<#{r(:wiki_link_id, :wikipedia_id2, :revision_id, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
175
|
+
# properties
|
176
|
+
wordnet: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_prop, :property)}> \s<#{r(:wordnet_inst, :wn_reln, :wn_class, :wn_pos, :wn_idx)}> \s#{r(:rdf_eol)} \z}x,
|
177
|
+
#
|
178
|
+
property_bool: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_bool, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
179
|
+
property_int: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_int, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
180
|
+
property_float: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_float, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
181
|
+
property_date: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_date, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
182
|
+
property_yearmonth: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_yearmonth, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
183
|
+
property_monthday: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_monthday, :val, :val_type) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
184
|
+
property_str: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:rdf_string, :val) } \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
185
|
+
#
|
186
|
+
persondata_reln: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s<#{r(:dbpedia_rsrc, :into_wpid)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
187
|
+
property_foaf: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:foaf_prop, :property)}> \s#{r(:rdf_string, :val)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
188
|
+
property_desc: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:purl_desc, :property)}> \s#{r(:rdf_string,:name)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
189
|
+
yago: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:yago_class, :scheme, :obj_class)}> \s#{r(:rdf_eol)} \z}x,
|
190
|
+
instance_type_a: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:dbpedia_ont, :obj_class)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
191
|
+
instance_type_b: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:rdf_type)}> \s<#{r(:schema_type, :org, :obj_class)}> \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
192
|
+
property_specmap: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:dbpedia_ont, :property)}> \s#{r(:dbpedia_value, :val, :units)} \s<#{r(:wiki_link_id_sec, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno)}> \s#{r(:rdf_eol)} \z}x,
|
193
|
+
# topical_concepts: %r{\A<#{r(:dbpedia_rsrc, :wikipedia_id)}> \s<#{r(:skos_subject)}> \s<#{r(:x, )}> \z},
|
139
194
|
}
|
140
|
-
|
141
|
-
|
195
|
+
MAPPING_RES.each{|re_name, re| MAPPING_INFO[re_name][:re] = re }
|
196
|
+
SKIPPAPLE_FIELDS = [:flavor, :wikipedia_id2, :revision_id, :article_section, :section_lineno, :article_lineno, :val_lang, :name_lang, :_dtyp]
|
142
197
|
|
143
198
|
class RdfExtractor < Wukong::Streamer::LineStreamer
|
144
199
|
include MungingUtils
|
200
|
+
attr_accessor :flavor, :kind, :filename, :regexps, :seen_keys, :seen_props
|
201
|
+
|
202
|
+
def initialize(*args)
|
203
|
+
@seen_keys = Hash.new(0)
|
204
|
+
@seen_props = Hash.new(0)
|
205
|
+
end
|
206
|
+
|
207
|
+
def record_for_flavor(kind, fields, flavor, hsh)
|
208
|
+
hsh.merge!( wp_ns: 0, flavor: flavor )
|
209
|
+
return if kind == :skip
|
210
|
+
|
211
|
+
case flavor
|
212
|
+
when :property_str, :property_foaf then hsh[:val] = MultiJson.encode(hsh[:val])
|
213
|
+
when :abstract_long, :abstract_short then hsh[:abstract] = MultiJson.encode(hsh[:abstract])
|
214
|
+
when :title then hsh[:title] = MultiJson.encode(hsh[:title]) ; hsh[:property] = 'title'
|
215
|
+
when :category_skos_title then hsh[:category_title] = MultiJson.encode(hsh[:category_title])
|
216
|
+
when :category_skos_type then hsh[:scheme] = 'skos'
|
217
|
+
when :category_subject then hsh[:scheme] = 'subject'
|
218
|
+
when :instance_type_a then hsh[:scheme] = 'dbpedia'
|
219
|
+
when :instance_type_b
|
220
|
+
hsh[:scheme] = SCHEMA_SCHEMES[hsh.delete(:org)]
|
221
|
+
return if hsh[:scheme] == 'owl'
|
222
|
+
when :wikipedia_link, :wikipedia_backlink
|
223
|
+
raise "Titles disagree!" unless hsh[:slug] == hsh[:wikipedia_id]
|
224
|
+
end
|
145
225
|
|
146
|
-
|
147
|
-
|
226
|
+
# record seen properties, seen fields
|
227
|
+
hsh.except(*fields).except(*SKIPPAPLE_FIELDS).
|
228
|
+
each{|key, val| @seen_keys[key] += 1 if val.present? }
|
229
|
+
seen_props[hsh[:property]] += 1 if hsh[:property].present?
|
230
|
+
sanity_check(hsh)
|
231
|
+
#
|
232
|
+
[kind] + hsh.values_at(*fields)
|
233
|
+
end
|
234
|
+
|
235
|
+
def sanity_check(hsh)
|
236
|
+
hsh.each{|key,val| raise if CONTROL_CHARS_RE =~ val.to_s }
|
237
|
+
end
|
238
|
+
|
239
|
+
def after_stream
|
240
|
+
Log.info ["seen keys:", seen_keys.inspect, "seen props:", seen_props.inspect].join("\t")
|
148
241
|
end
|
149
242
|
|
150
243
|
def process(line)
|
151
244
|
return if line =~ /\A(?:\#|$)/
|
152
|
-
if line =~ /=> \w+\.\w+ <=/ then yield [line] ; return ; end
|
153
|
-
|
154
|
-
|
155
|
-
|
245
|
+
if (line =~ /=> \w+\.\w+ <=/) then yield [line] ; return ; end
|
246
|
+
|
247
|
+
MAPPING_INFO.each do |flavor, info|
|
248
|
+
next unless mm = info[:re].match(line)
|
249
|
+
yield record_for_flavor(info[:kind], info[:fields], flavor, mm.as_hash)
|
156
250
|
return
|
157
251
|
end
|
158
|
-
|
252
|
+
|
253
|
+
Log.warn ['not found:', line].join("\t")
|
159
254
|
end
|
160
255
|
end
|
161
256
|
end
|
162
257
|
|
163
258
|
|
164
|
-
# META = {
|
165
|
-
# geo_coordinates: [:field, 'geo_coordinates_en.nq', ],
|
166
|
-
# wordnet: [:joinkey, 'wordnet_links.nt', ],
|
167
|
-
# geonames: [:joinkey, 'geonames_links.nt', ],
|
168
|
-
# properties_specmap: [:properties, 'specific_mappingbased_properties_en.nq', ],
|
169
|
-
# properties_mapped: [:properties, 'mappingbased_properties_unredirected_en.nq', ],
|
170
|
-
# pnd: [:joinkey, 'pnd_en.nq', ],
|
171
|
-
# disambiguations: [:pagelink, 'disambiguations_unredirected_en.nq', ],
|
172
|
-
# external_links: [:weblink, 'external_links_en.nq', ],
|
173
|
-
# page_ids: [:field, 'page_ids_en.nq', ],
|
174
|
-
# redirects: [:pagelink, 'redirects_transitive_en.nt', ],
|
175
|
-
# article_categories: [:categories, 'article_categories_en.nq', ],
|
176
|
-
# instance_types: [:categories, 'instance_types_en.nq', ],
|
177
|
-
# categories_skos: [:meta, 'skos_categories_en.nq', ],
|
178
|
-
# abstracts_long: [:field, 'long_abstracts_en.nq', ],
|
179
|
-
# abstracts_short: [:field, 'short_abstracts_en.nq', ],
|
180
|
-
# category_labels: [:meta, 'category_labels_en.nq', ],
|
181
|
-
# titles: [:field, 'labels_en.nq', ],
|
182
|
-
# musicbrainz: [:joinkey, 'musicbrainz_links.nt', ],
|
183
|
-
# nytimes: [:joinkey, 'nytimes_links.nt', ],
|
184
|
-
# uscensus: [:joinkey, 'uscensus_links.nt', ],
|
185
|
-
# topical_concepts: [ 'topical_concepts_unredirected_en.nq', ],
|
186
|
-
# homepages: [:weblink, 'homepages_en.nq', ],
|
187
|
-
# wikipedia_links: [:field, 'wikipedia_links_en.nq', ],
|
188
|
-
# persondata: [:properties, 'persondata_unredirected_en.nq', ],
|
189
|
-
# yago: [:joinkey, 'yago_links.nt', ],
|
190
|
-
# }
|
191
|
-
|
192
259
|
|
193
260
|
Wukong::Script.new(Dbpedia::RdfExtractor, nil).run
|