semantic-crawler 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.travis.yml +4 -0
- data/.yardopts +5 -0
- data/Gemfile +18 -0
- data/README.rdoc +25 -8
- data/Rakefile +0 -1
- data/changelog.sh +4 -0
- data/exploitation/freebase.rb +13 -0
- data/lib/semantic_crawler.rb +2 -0
- data/lib/semantic_crawler/factbook/country.rb +1 -0
- data/lib/semantic_crawler/fao/country.rb +30 -4
- data/lib/semantic_crawler/gdacs/feed.rb +1 -0
- data/lib/semantic_crawler/geo_names/country.rb +48 -0
- data/lib/semantic_crawler/linked_geo_data/relevant_node.rb +25 -12
- data/lib/semantic_crawler/linked_geo_data/relevant_nodes.rb +6 -0
- data/lib/semantic_crawler/version.rb +1 -1
- data/log/.gitkeep +0 -0
- data/semantic_crawler.gemspec +35 -0
- data/spec/dbpedia_spec.rb +9 -0
- data/spec/factbook_spec.rb +89 -0
- data/spec/fao_austria_spec.rb +118 -0
- data/spec/fao_papua_new_guinea_spec.rb +97 -0
- data/spec/freebase_spec.rb +17 -0
- data/spec/gdacs_spec.rb +111 -0
- data/spec/geo_names_spec.rb +36 -0
- data/spec/linked_geo_data_spec.rb +44 -0
- data/spec/spec_helper.rb +13 -0
- data/test/dummy/app/mailers/.gitkeep +0 -0
- data/test/dummy/app/models/.gitkeep +0 -0
- data/test/dummy/lib/assets/.gitkeep +0 -0
- data/test/dummy/log/.gitkeep +0 -0
- data/test/dummy/log/test.log +30 -0
- metadata +90 -43
- data/log/semantic-crawler.log +0 -33
- data/log/semantic-crawler.log.20120413 +0 -31
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/.yardopts
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
|
3
|
+
# Declare your gem's dependencies in semantic_crawler.gemspec.
|
4
|
+
# Bundler will treat runtime dependencies like base dependencies, and
|
5
|
+
# development dependencies will be added by default to the :development group.
|
6
|
+
gemspec
|
7
|
+
|
8
|
+
# jquery-rails is used by the dummy application
|
9
|
+
gem "jquery-rails"
|
10
|
+
|
11
|
+
#
|
12
|
+
# Declare any dependencies that are still in development here instead of in
|
13
|
+
# your gemspec. These might include edge Rails or gems from your path or
|
14
|
+
# Git. Remember to move these dependencies to your gemspec before releasing
|
15
|
+
# your gem to rubygems.org.
|
16
|
+
|
17
|
+
# To use debugger
|
18
|
+
# gem 'ruby-debug'
|
data/README.rdoc
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
= SemanticCrawler
|
1
|
+
= SemanticCrawler
|
2
2
|
|
3
3
|
This project encapsulates data gathering from different sources.
|
4
4
|
It simplifies the extension of internal data with publicly available
|
@@ -8,15 +8,15 @@ to bypass complex NLP (natural language processing).
|
|
8
8
|
|
9
9
|
== Supported Sources
|
10
10
|
|
11
|
-
*
|
12
|
-
*
|
13
|
-
* FAO - Food and Agriculture Organization of the United Nations
|
14
|
-
* LinkedGeoData - LGD
|
15
|
-
* [
|
11
|
+
* {Geonames}[http://www.geonames.org/]
|
12
|
+
* {CIA Factbook RDF Dump}[http://www4.wiwiss.fu-berlin.de/factbook/directory/countries]
|
13
|
+
* {FAO - Food and Agriculture Organization of the United Nations}[http://www.fao.org]
|
14
|
+
* {LinkedGeoData - LGD}[http://linkedgeodata.org]
|
15
|
+
* {GDACS}[http://gdacs.org]
|
16
|
+
* [Started] {Freebase}[http://freebase.com]
|
16
17
|
|
17
18
|
=== TODO
|
18
19
|
|
19
|
-
* Geonames
|
20
20
|
* DBPedia
|
21
21
|
* Different Government Sources
|
22
22
|
|
@@ -45,8 +45,25 @@ Or from source:
|
|
45
45
|
|
46
46
|
== Examples
|
47
47
|
|
48
|
+
These examples are only a short outline of how to use the library. For more
|
49
|
+
information read the documentation or look into the source code. To use the
|
50
|
+
library include or execute the following line:
|
51
|
+
|
48
52
|
>> require "semantic_crawler"
|
49
53
|
|
54
|
+
=== GeoNames
|
55
|
+
|
56
|
+
The GeoNames module is able to return a Factbook::Country and Fao::Country
|
57
|
+
object based on the input GPS coordinates (lat/long).
|
58
|
+
|
59
|
+
>> @innsbruck = SemanticCrawler::GeoNames::Country.new(47.271338, 11.395333)
|
60
|
+
>> articles = @innsbruck.get_wikipedia_articles
|
61
|
+
>> articles.each do |article|
|
62
|
+
>> puts article.wikipedia_url
|
63
|
+
>> end
|
64
|
+
>> factbook_obj = @innsbruck.get_factbook_country
|
65
|
+
>> fao_obj = @innsbruck.get_fao_country
|
66
|
+
|
50
67
|
=== Factbook
|
51
68
|
|
52
69
|
Fetch Factbook information about Austria:
|
@@ -90,7 +107,7 @@ Country information from {FAO}[http://www.fao.org]:
|
|
90
107
|
|
91
108
|
=== LinkedGeoData
|
92
109
|
|
93
|
-
Geo information from {LinkedGeoData}[http://linkedgeodata.org]
|
110
|
+
Geo information from {LinkedGeoData}[http://linkedgeodata.org]:
|
94
111
|
|
95
112
|
>> # All nodes around the center of dresden, in a radius of 1000m
|
96
113
|
>> @dresden = SemanticCrawler::LinkedGeoData::RelevantNodes.new(51.033333, 13.733333, 1000)
|
data/Rakefile
CHANGED
data/changelog.sh
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'httparty'
|
3
|
+
require 'json'
|
4
|
+
require 'pp'
|
5
|
+
|
6
|
+
#query = [{'id' => '/en/austria', 'name' => nil, 'type' => '/location/country', '*' => [{}] }]
|
7
|
+
query = [{'id' => '/en/austria', 'type' => '/type/property' }]
|
8
|
+
query_envelope = {'query' => query }
|
9
|
+
service_url = 'http://api.freebase.com/api/service/mqlread'
|
10
|
+
url = service_url + '?query=' + CGI::escape(query_envelope.to_json)
|
11
|
+
|
12
|
+
response = HTTParty.get(url, :format => :json)
|
13
|
+
puts response
|
data/lib/semantic_crawler.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
require "httparty"
|
4
|
+
require "geonames"
|
4
5
|
|
5
6
|
# The top level module contains the different data sources
|
6
7
|
# as sub-modules. Currently there are the following modules
|
@@ -38,6 +39,7 @@ require "semantic_crawler/dbpedia"
|
|
38
39
|
|
39
40
|
# GeoNames.org - module: GeoNames
|
40
41
|
require "semantic_crawler/geo_names"
|
42
|
+
require "semantic_crawler/geo_names/country"
|
41
43
|
|
42
44
|
# LinkedGeoData.org - module: LinkedGeoData
|
43
45
|
require "semantic_crawler/linked_geo_data"
|
@@ -3,6 +3,7 @@ module SemanticCrawler
|
|
3
3
|
# Represents Food and Agriculture information about one country.
|
4
4
|
class Country
|
5
5
|
|
6
|
+
# The URI prefix of the fao country object
|
6
7
|
@@URI_PREFIX = "http://www.fao.org/countryprofiles/geoinfo/geopolitical/data/"
|
7
8
|
|
8
9
|
# Namespace hash
|
@@ -35,11 +36,13 @@ module SemanticCrawler
|
|
35
36
|
end
|
36
37
|
|
37
38
|
# The dbpedia identifier (from fao:codeDBPediaID)
|
39
|
+
# @return [String]
|
38
40
|
def code_dbpedia_id
|
39
41
|
query_root_node("fao:codeDBPediaID/text()", @@NAMESPACES).to_s
|
40
42
|
end
|
41
43
|
|
42
44
|
# Links to additional information (from owl:sameAs)
|
45
|
+
# @return [Array<String>]
|
43
46
|
def same_as
|
44
47
|
returnLinks = []
|
45
48
|
links = query_root_node("owl:sameAs/@rdf:resource", @@NAMESPACES)
|
@@ -50,63 +53,75 @@ module SemanticCrawler
|
|
50
53
|
end
|
51
54
|
|
52
55
|
# The type as URL of this entity (from rdf:type)
|
56
|
+
# @return [String]
|
53
57
|
def type_url
|
54
58
|
query_root_node("rdf:type/@rdf:resource", @@NAMESPACES).to_s
|
55
59
|
end
|
56
60
|
|
57
61
|
# The maximum latitude (from fao:hasMaxLatitude)
|
62
|
+
# @return [String]
|
58
63
|
def max_latitude
|
59
64
|
query_root_node("fao:hasMaxLatitude/text()", @@NAMESPACES).to_s
|
60
65
|
end
|
61
66
|
|
62
67
|
# The maximum longitude (from fao:hasMaxLongitude)
|
68
|
+
# @return [String]
|
63
69
|
def max_longitude
|
64
70
|
query_root_node("fao:hasMaxLongitude/text()", @@NAMESPACES).to_s
|
65
71
|
end
|
66
72
|
|
67
73
|
# The minimum latitude (from fao:hasMinLatitude)
|
74
|
+
# @return [String]
|
68
75
|
def min_latitude
|
69
76
|
query_root_node("fao:hasMinLatitude/text()", @@NAMESPACES).to_s
|
70
77
|
end
|
71
78
|
|
72
79
|
# The minimum longitude (from fao:hasMinLongitude)
|
80
|
+
# @return [String]
|
73
81
|
def min_longitude
|
74
82
|
query_root_node("fao:hasMinLongitude/text()", @@NAMESPACES).to_s
|
75
83
|
end
|
76
84
|
|
77
85
|
# Human readable description about the land area (from fao:landAreaNotes)
|
86
|
+
# @return [String]
|
78
87
|
def land_area_notes
|
79
88
|
query_root_node("fao:landAreaNotes/text()", @@NAMESPACES).to_s
|
80
89
|
end
|
81
90
|
|
82
91
|
# Land area total value (from fao:landAreaTotal)
|
92
|
+
# @return [String]
|
83
93
|
def land_area_total
|
84
94
|
query_root_node("fao:landAreaTotal/text()", @@NAMESPACES).to_s
|
85
95
|
end
|
86
96
|
|
87
97
|
# Land area unit (from fao:landAreaUnit)
|
98
|
+
# @return [String]
|
88
99
|
def land_area_unit
|
89
100
|
query_root_node("fao:landAreaUnit/text()", @@NAMESPACES).to_s
|
90
101
|
end
|
91
102
|
|
92
103
|
# Land area year (from fao:landAreaYear)
|
104
|
+
# @return [String]
|
93
105
|
def land_area_year
|
94
106
|
query_root_node("fao:landAreaYear/text()", @@NAMESPACES).to_s
|
95
107
|
end
|
96
108
|
|
97
109
|
# The currency name.
|
98
|
-
# @param [
|
99
|
-
|
110
|
+
# @param [String] lang The language in which the currency name should be returned
|
111
|
+
# @return [String]
|
112
|
+
def name_currency(lang = 'en')
|
100
113
|
query_root_node("fao:nameCurrency[@xml:lang='#{lang}']/text()", @@NAMESPACES).to_s
|
101
114
|
end
|
102
115
|
|
103
116
|
# The official country name
|
104
|
-
# @param [
|
105
|
-
|
117
|
+
# @param [String] lang The language in which the official name should be returned
|
118
|
+
# @return [String]
|
119
|
+
def official_name(lang = 'en')
|
106
120
|
query_root_node("fao:nameOfficial[@xml:lang='#{lang}']/text()", @@NAMESPACES).to_s
|
107
121
|
end
|
108
122
|
|
109
123
|
# Classification of this country as name (from fao:isInGroup)
|
124
|
+
# @return [Array<String>]
|
110
125
|
def is_in_group_name
|
111
126
|
returnGroup = []
|
112
127
|
group = query_root_node("fao:isInGroup/@rdf:resource", @@NAMESPACES)
|
@@ -120,6 +135,7 @@ module SemanticCrawler
|
|
120
135
|
|
121
136
|
# Classification of this country as dereferenceable URL (from
|
122
137
|
# fao:isInGroup)
|
138
|
+
# @return [Array<String>]
|
123
139
|
def is_in_group_url
|
124
140
|
returnGroup = []
|
125
141
|
group = query_root_node("fao:isInGroup/@rdf:resource", @@NAMESPACES)
|
@@ -133,6 +149,7 @@ module SemanticCrawler
|
|
133
149
|
|
134
150
|
# Returns all countries that share a border with this country (as
|
135
151
|
# dereferencable URL - from fao:hasBorderWith)
|
152
|
+
# @return [Array<String>]
|
136
153
|
def has_boarder_with_url
|
137
154
|
returnGroup = []
|
138
155
|
group = query_root_node("fao:hasBorderWith/@rdf:resource", @@NAMESPACES)
|
@@ -146,6 +163,7 @@ module SemanticCrawler
|
|
146
163
|
|
147
164
|
# Returns all countries that share a border with this country (as
|
148
165
|
# name)
|
166
|
+
# @return [Array<String>]
|
149
167
|
def has_boarder_with_name
|
150
168
|
returnGroup = []
|
151
169
|
group = query_root_node("fao:hasBorderWith/@rdf:resource", @@NAMESPACES)
|
@@ -158,36 +176,43 @@ module SemanticCrawler
|
|
158
176
|
end
|
159
177
|
|
160
178
|
# Population notes (from fao:populationNotes)
|
179
|
+
# @return [String]
|
161
180
|
def population_notes
|
162
181
|
query_root_node("fao:populationNotes/text()", @@NAMESPACES).to_s
|
163
182
|
end
|
164
183
|
|
165
184
|
# Population total (from fao:populationTotal)
|
185
|
+
# @return [String]
|
166
186
|
def population_total
|
167
187
|
query_root_node("fao:populationTotal/text()", @@NAMESPACES).to_s
|
168
188
|
end
|
169
189
|
|
170
190
|
# Population unit (from fao:populationUnit)
|
191
|
+
# @return [String]
|
171
192
|
def population_unit
|
172
193
|
query_root_node("fao:populationUnit/text()", @@NAMESPACES).to_s
|
173
194
|
end
|
174
195
|
|
175
196
|
# Population year (from fao:populationYear)
|
197
|
+
# @return [String]
|
176
198
|
def population_year
|
177
199
|
query_root_node("fao:populationYear/text()", @@NAMESPACES).to_s
|
178
200
|
end
|
179
201
|
|
180
202
|
# Entity is valid since (from fao:validSince)
|
203
|
+
# @return [String]
|
181
204
|
def valid_since
|
182
205
|
query_root_node("fao:validSince/text()", @@NAMESPACES).to_s
|
183
206
|
end
|
184
207
|
|
185
208
|
# Entity is valid until (from fao:validUntil)
|
209
|
+
# @return [String]
|
186
210
|
def valid_until
|
187
211
|
query_root_node("fao:validUntil/text()", @@NAMESPACES).to_s
|
188
212
|
end
|
189
213
|
|
190
214
|
# Executes an XPath query, optionally with a hash of namespaces
|
215
|
+
# @return [String]
|
191
216
|
def query_root_node(xpath_query, namespaces = {})
|
192
217
|
if !@root_node.nil?
|
193
218
|
@root_node.xpath(xpath_query, namespaces)
|
@@ -195,6 +220,7 @@ module SemanticCrawler
|
|
195
220
|
end
|
196
221
|
|
197
222
|
# Outputs the document as XML
|
223
|
+
# @return [String] The document serialized as XML
|
198
224
|
def xml_document
|
199
225
|
@root_node.to_s
|
200
226
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module SemanticCrawler
|
2
|
+
module GeoNames
|
3
|
+
# Resolves the country for a pair of GPS coordinates via GeoNames and gives access to the matching Factbook and FAO country objects.
|
4
|
+
class Country
|
5
|
+
|
6
|
+
# @attribute [r]
|
7
|
+
# The input latitude
|
8
|
+
attr_reader :latitude
|
9
|
+
|
10
|
+
# @attribute [r]
|
11
|
+
# The input longitude
|
12
|
+
attr_reader :longitude
|
13
|
+
|
14
|
+
def initialize(new_latitude, new_longitude)
|
15
|
+
@latitude = new_latitude
|
16
|
+
@longitude = new_longitude
|
17
|
+
end
|
18
|
+
|
19
|
+
# Returns an ISO 3166-1 alpha-2 country code
|
20
|
+
def get_country_code
|
21
|
+
Geonames::WebService.country_code @latitude, @longitude
|
22
|
+
end
|
23
|
+
|
24
|
+
# Returns the name of the country
|
25
|
+
def get_country_name
|
26
|
+
subdivision = Geonames::WebService.country_subdivision @latitude, @longitude
|
27
|
+
subdivision.country_name
|
28
|
+
end
|
29
|
+
|
30
|
+
# Returns nearby Wikipedia article objects, each containing a link to the article,
|
31
|
+
# summary and many more fields.
|
32
|
+
def get_wikipedia_articles
|
33
|
+
Geonames::WebService.find_nearby_wikipedia :lat => @latitude, :long => @longitude
|
34
|
+
end
|
35
|
+
|
36
|
+
# @return [SemanticCrawler::Factbook::Country] A Factbook country object
|
37
|
+
def get_factbook_country
|
38
|
+
SemanticCrawler::Factbook::Country.new get_country_name
|
39
|
+
end
|
40
|
+
|
41
|
+
# @return [SemanticCrawler::Fao::Country] A Food and Agriculture country object
|
42
|
+
def get_fao_country
|
43
|
+
SemanticCrawler::Fao::Country.new get_country_name
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -23,33 +23,39 @@ module SemanticCrawler
|
|
23
23
|
end
|
24
24
|
|
25
25
|
# geo:lat
|
26
|
+
# @return [String]
|
26
27
|
def latitude
|
27
|
-
query_root_node("geo:lat/text()", @@NAMESPACES)
|
28
|
+
query_root_node("geo:lat/text()", @@NAMESPACES).to_s
|
28
29
|
end
|
29
30
|
|
30
31
|
# geo:long
|
32
|
+
# @return [String]
|
31
33
|
def longitude
|
32
|
-
query_root_node("geo:long/text()", @@NAMESPACES)
|
34
|
+
query_root_node("geo:long/text()", @@NAMESPACES).to_s
|
33
35
|
end
|
34
36
|
|
35
37
|
# The type as URL
|
38
|
+
# @return [String]
|
36
39
|
def type
|
37
|
-
query_root_node("rdf:type/@rdf:resource", @@NAMESPACES)
|
40
|
+
query_root_node("rdf:type/@rdf:resource", @@NAMESPACES).to_s
|
38
41
|
end
|
39
42
|
|
40
43
|
# Comment about the entity
|
44
|
+
# @return [String]
|
41
45
|
def note
|
42
|
-
query_root_node("lgdp:note/text()", @@NAMESPACES)
|
46
|
+
query_root_node("lgdp:note/text()", @@NAMESPACES).to_s
|
43
47
|
end
|
44
48
|
|
45
49
|
# Data Source
|
50
|
+
# @return [String]
|
46
51
|
def created_by
|
47
|
-
query_root_node("lgdp:created_by/text()", @@NAMESPACES)
|
52
|
+
query_root_node("lgdp:created_by/text()", @@NAMESPACES).to_s
|
48
53
|
end
|
49
54
|
|
50
55
|
# User link
|
56
|
+
# @return [String]
|
51
57
|
def contributor
|
52
|
-
query_root_node("lgdo:contributor/@rdf:resource", @@NAMESPACES)
|
58
|
+
query_root_node("lgdo:contributor/@rdf:resource", @@NAMESPACES).to_s
|
53
59
|
end
|
54
60
|
|
55
61
|
# Link to way
|
@@ -58,33 +64,39 @@ module SemanticCrawler
|
|
58
64
|
end
|
59
65
|
|
60
66
|
# Street name
|
67
|
+
# @return [String]
|
61
68
|
def has_street
|
62
|
-
query_root_node("lgdo:hasStreet/text()", @@NAMESPACES)
|
69
|
+
query_root_node("lgdo:hasStreet/text()", @@NAMESPACES).to_s
|
63
70
|
end
|
64
71
|
|
65
72
|
# Postal code
|
73
|
+
# @return [String]
|
66
74
|
def has_postal_code
|
67
|
-
query_root_node("lgdo:hasPostalCode/text()", @@NAMESPACES)
|
75
|
+
query_root_node("lgdo:hasPostalCode/text()", @@NAMESPACES).to_s
|
68
76
|
end
|
69
77
|
|
70
78
|
# House number
|
79
|
+
# @return [String]
|
71
80
|
def has_house_number
|
72
|
-
query_root_node("lgdo:hasHouseNumber/text()", @@NAMESPACES)
|
81
|
+
query_root_node("lgdo:hasHouseNumber/text()", @@NAMESPACES).to_s
|
73
82
|
end
|
74
83
|
|
75
84
|
# Country in ISO 3166-1 alpha-2
|
85
|
+
# @return [String]
|
76
86
|
def has_country
|
77
|
-
query_root_node("lgdo:hasCountry/text()", @@NAMESPACES)
|
87
|
+
query_root_node("lgdo:hasCountry/text()", @@NAMESPACES).to_s
|
78
88
|
end
|
79
89
|
|
80
90
|
# City name
|
91
|
+
# @return [String]
|
81
92
|
def has_city
|
82
|
-
query_root_node("lgdo:hasCity/text()", @@NAMESPACES)
|
93
|
+
query_root_node("lgdo:hasCity/text()", @@NAMESPACES).to_s
|
83
94
|
end
|
84
95
|
|
85
96
|
# Is this place wheelchair friendly?
|
97
|
+
# @return [String]
|
86
98
|
def wheelchair
|
87
|
-
query_root_node("lgdo:wheelchair/@rdf:resource", @@NAMESPACES)
|
99
|
+
query_root_node("lgdo:wheelchair/@rdf:resource", @@NAMESPACES).to_s
|
88
100
|
end
|
89
101
|
|
90
102
|
# Query the root_node
|
@@ -95,6 +107,7 @@ module SemanticCrawler
|
|
95
107
|
end
|
96
108
|
|
97
109
|
# Outputs the document as XML
|
110
|
+
# @return [String] The document serialized as XML
|
98
111
|
def xml_document
|
99
112
|
@root_node.to_s
|
100
113
|
end
|