semantic-crawler 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.travis.yml +4 -0
- data/.yardopts +5 -0
- data/Gemfile +18 -0
- data/README.rdoc +25 -8
- data/Rakefile +0 -1
- data/changelog.sh +4 -0
- data/exploitation/freebase.rb +13 -0
- data/lib/semantic_crawler.rb +2 -0
- data/lib/semantic_crawler/factbook/country.rb +1 -0
- data/lib/semantic_crawler/fao/country.rb +30 -4
- data/lib/semantic_crawler/gdacs/feed.rb +1 -0
- data/lib/semantic_crawler/geo_names/country.rb +48 -0
- data/lib/semantic_crawler/linked_geo_data/relevant_node.rb +25 -12
- data/lib/semantic_crawler/linked_geo_data/relevant_nodes.rb +6 -0
- data/lib/semantic_crawler/version.rb +1 -1
- data/log/.gitkeep +0 -0
- data/semantic_crawler.gemspec +35 -0
- data/spec/dbpedia_spec.rb +9 -0
- data/spec/factbook_spec.rb +89 -0
- data/spec/fao_austria_spec.rb +118 -0
- data/spec/fao_papua_new_guinea_spec.rb +97 -0
- data/spec/freebase_spec.rb +17 -0
- data/spec/gdacs_spec.rb +111 -0
- data/spec/geo_names_spec.rb +36 -0
- data/spec/linked_geo_data_spec.rb +44 -0
- data/spec/spec_helper.rb +13 -0
- data/test/dummy/app/mailers/.gitkeep +0 -0
- data/test/dummy/app/models/.gitkeep +0 -0
- data/test/dummy/lib/assets/.gitkeep +0 -0
- data/test/dummy/log/.gitkeep +0 -0
- data/test/dummy/log/test.log +30 -0
- metadata +90 -43
- data/log/semantic-crawler.log +0 -33
- data/log/semantic-crawler.log.20120413 +0 -31
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/.yardopts
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
|
3
|
+
# Declare your gem's dependencies in semantic_crawler.gemspec.
|
4
|
+
# Bundler will treat runtime dependencies like base dependencies, and
|
5
|
+
# development dependencies will be added by default to the :development group.
|
6
|
+
gemspec
|
7
|
+
|
8
|
+
# jquery-rails is used by the dummy application
|
9
|
+
gem "jquery-rails"
|
10
|
+
|
11
|
+
#
|
12
|
+
# Declare any dependencies that are still in development here instead of in
|
13
|
+
# your gemspec. These might include edge Rails or gems from your path or
|
14
|
+
# Git. Remember to move these dependencies to your gemspec before releasing
|
15
|
+
# your gem to rubygems.org.
|
16
|
+
|
17
|
+
# To use debugger
|
18
|
+
# gem 'ruby-debug'
|
data/README.rdoc
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
= SemanticCrawler
|
1
|
+
= SemanticCrawler
|
2
2
|
|
3
3
|
This project encapsulates data gathering from different sources.
|
4
4
|
It simplifies the extension of internal data with publicly available
|
@@ -8,15 +8,15 @@ to bypass complex NLP (natural language processing).
|
|
8
8
|
|
9
9
|
== Supported Sources
|
10
10
|
|
11
|
-
*
|
12
|
-
*
|
13
|
-
* FAO - Food and Agriculture Organization of the United Nations
|
14
|
-
* LinkedGeoData - LGD
|
15
|
-
* [
|
11
|
+
* {Geonames}[http://www.geonames.org/]
|
12
|
+
* {CIA Factbook RDF Dump}[http://www4.wiwiss.fu-berlin.de/factbook/directory/countries]
|
13
|
+
* {FAO - Food and Agriculture Organization of the United Nations}[http://www.fao.org]
|
14
|
+
* {LinkedGeoData - LGD}[http://linkedgeodata.org]
|
15
|
+
* {GDACS}[http://gdacs.org]
|
16
|
+
* [Started] {Freebase}[http://freebase.com]
|
16
17
|
|
17
18
|
=== TODO
|
18
19
|
|
19
|
-
* Geonames
|
20
20
|
* DBPedia
|
21
21
|
* Different Government Sources
|
22
22
|
|
@@ -45,8 +45,25 @@ Or from source:
|
|
45
45
|
|
46
46
|
== Examples
|
47
47
|
|
48
|
+
These examples are only a short outline of how to use the library. For more
|
49
|
+
information read the documentation or look into the source code. To use the
|
50
|
+
library include or execute the following line:
|
51
|
+
|
48
52
|
>> require "semantic_crawler"
|
49
53
|
|
54
|
+
=== GeoNames
|
55
|
+
|
56
|
+
The GeoNames module is able to return a Factbook::Country and Fao::Country
|
57
|
+
object based on the input GPS coordinates (lat/long).
|
58
|
+
|
59
|
+
>> @innsbruck = SemanticCrawler::GeoNames::Country.new(47.271338, 11.395333)
|
60
|
+
>> articles = @innsbruck.get_wikipedia_articles
|
61
|
+
>> articles.each do |article|
|
62
|
+
>> puts article.wikipedia_url
|
63
|
+
>> end
|
64
|
+
>> factbook_obj = @innsbruck.get_factbook_country
|
65
|
+
>> fao_obj = @innsbruck.get_fao_country
|
66
|
+
|
50
67
|
=== Factbook
|
51
68
|
|
52
69
|
Fetch Factbook information about Austria:
|
@@ -90,7 +107,7 @@ Country information from {FAO}[http://www.fao.org]:
|
|
90
107
|
|
91
108
|
=== LinkedGeoData
|
92
109
|
|
93
|
-
Geo information from {LinkedGeoData}[http://linkedgeodata.org]
|
110
|
+
Geo information from {LinkedGeoData}[http://linkedgeodata.org]:
|
94
111
|
|
95
112
|
>> # All nodes around the center of dresden, in a radius of 1000m
|
96
113
|
>> @dresden = SemanticCrawler::LinkedGeoData::RelevantNodes.new(51.033333, 13.733333, 1000)
|
data/Rakefile
CHANGED
data/changelog.sh
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'httparty'
|
3
|
+
require 'json'
|
4
|
+
require 'pp'
|
5
|
+
|
6
|
+
#query = [{'id' => '/en/austria', 'name' => nil, 'type' => '/location/country', '*' => [{}] }]
|
7
|
+
query = [{'id' => '/en/austria', 'type' => '/type/property' }]
|
8
|
+
query_envelope = {'query' => query }
|
9
|
+
service_url = 'http://api.freebase.com/api/service/mqlread'
|
10
|
+
url = service_url + '?query=' + CGI::escape(query_envelope.to_json)
|
11
|
+
|
12
|
+
response = HTTParty.get(url, :format => :json)
|
13
|
+
puts response
|
data/lib/semantic_crawler.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
require "httparty"
|
4
|
+
require "geonames"
|
4
5
|
|
5
6
|
# The top level module contains the different data sources
|
6
7
|
# as sub-modules. Currently there are the following modules
|
@@ -38,6 +39,7 @@ require "semantic_crawler/dbpedia"
|
|
38
39
|
|
39
40
|
# GeoNames.org - module: GeoNames
|
40
41
|
require "semantic_crawler/geo_names"
|
42
|
+
require "semantic_crawler/geo_names/country"
|
41
43
|
|
42
44
|
# LinkedGeoData.org - module: LinkedGeoData
|
43
45
|
require "semantic_crawler/linked_geo_data"
|
@@ -3,6 +3,7 @@ module SemanticCrawler
|
|
3
3
|
# Represents Food and Agriculture information about one country.
|
4
4
|
class Country
|
5
5
|
|
6
|
+
# The URI prefix of the fao country object
|
6
7
|
@@URI_PREFIX = "http://www.fao.org/countryprofiles/geoinfo/geopolitical/data/"
|
7
8
|
|
8
9
|
# Namespace hash
|
@@ -35,11 +36,13 @@ module SemanticCrawler
|
|
35
36
|
end
|
36
37
|
|
37
38
|
# The dbpedia identifier (from fao:codeDBPediaID)
|
39
|
+
# @return [String]
|
38
40
|
def code_dbpedia_id
|
39
41
|
query_root_node("fao:codeDBPediaID/text()", @@NAMESPACES).to_s
|
40
42
|
end
|
41
43
|
|
42
44
|
# Links to additional information (from owl:sameAs)
|
45
|
+
# @return [Array<String>]
|
43
46
|
def same_as
|
44
47
|
returnLinks = []
|
45
48
|
links = query_root_node("owl:sameAs/@rdf:resource", @@NAMESPACES)
|
@@ -50,63 +53,75 @@ module SemanticCrawler
|
|
50
53
|
end
|
51
54
|
|
52
55
|
# The type as URL of this entity (from rdf:type)
|
56
|
+
# @return [String]
|
53
57
|
def type_url
|
54
58
|
query_root_node("rdf:type/@rdf:resource", @@NAMESPACES).to_s
|
55
59
|
end
|
56
60
|
|
57
61
|
# The maximum latitude (from fao:hasMaxLatitude)
|
62
|
+
# @return [String]
|
58
63
|
def max_latitude
|
59
64
|
query_root_node("fao:hasMaxLatitude/text()", @@NAMESPACES).to_s
|
60
65
|
end
|
61
66
|
|
62
67
|
# The maximum longitude (from fao:hasMaxLongitude)
|
68
|
+
# @return [String]
|
63
69
|
def max_longitude
|
64
70
|
query_root_node("fao:hasMaxLongitude/text()", @@NAMESPACES).to_s
|
65
71
|
end
|
66
72
|
|
67
73
|
# The minimum latitude (from fao:hasMinLatitude)
|
74
|
+
# @return [String]
|
68
75
|
def min_latitude
|
69
76
|
query_root_node("fao:hasMinLatitude/text()", @@NAMESPACES).to_s
|
70
77
|
end
|
71
78
|
|
72
79
|
# The minimum longitude (from fao:hasMinLongitude)
|
80
|
+
# @return [String]
|
73
81
|
def min_longitude
|
74
82
|
query_root_node("fao:hasMinLongitude/text()", @@NAMESPACES).to_s
|
75
83
|
end
|
76
84
|
|
77
85
|
# Human readable description about the land area (from fao:landAreaNotes)
|
86
|
+
# @return [String]
|
78
87
|
def land_area_notes
|
79
88
|
query_root_node("fao:landAreaNotes/text()", @@NAMESPACES).to_s
|
80
89
|
end
|
81
90
|
|
82
91
|
# Land area total value (from fao:landAreaTotal)
|
92
|
+
# @return [String]
|
83
93
|
def land_area_total
|
84
94
|
query_root_node("fao:landAreaTotal/text()", @@NAMESPACES).to_s
|
85
95
|
end
|
86
96
|
|
87
97
|
# Land area unit (from fao:landAreaUnit)
|
98
|
+
# @return [String]
|
88
99
|
def land_area_unit
|
89
100
|
query_root_node("fao:landAreaUnit/text()", @@NAMESPACES).to_s
|
90
101
|
end
|
91
102
|
|
92
103
|
# Land area year (from fao:landAreaYear)
|
104
|
+
# @return [String]
|
93
105
|
def land_area_year
|
94
106
|
query_root_node("fao:landAreaYear/text()", @@NAMESPACES).to_s
|
95
107
|
end
|
96
108
|
|
97
109
|
# The currency name.
|
98
|
-
# @param [
|
99
|
-
|
110
|
+
# @param [String] lang The language in which the currency name should be returned
|
111
|
+
# @return [String]
|
112
|
+
def name_currency(lang = 'en')
|
100
113
|
query_root_node("fao:nameCurrency[@xml:lang='#{lang}']/text()", @@NAMESPACES).to_s
|
101
114
|
end
|
102
115
|
|
103
116
|
# The official country name
|
104
|
-
# @param [
|
105
|
-
|
117
|
+
# @param [String] lang The language in which the official name should be returned
|
118
|
+
# @return [String]
|
119
|
+
def official_name(lang = 'en')
|
106
120
|
query_root_node("fao:nameOfficial[@xml:lang='#{lang}']/text()", @@NAMESPACES).to_s
|
107
121
|
end
|
108
122
|
|
109
123
|
# Classification of this country as name (from fao:isInGroup)
|
124
|
+
# @return [Array<String>]
|
110
125
|
def is_in_group_name
|
111
126
|
returnGroup = []
|
112
127
|
group = query_root_node("fao:isInGroup/@rdf:resource", @@NAMESPACES)
|
@@ -120,6 +135,7 @@ module SemanticCrawler
|
|
120
135
|
|
121
136
|
# Classification of this country as dereferenceable URL (from
|
122
137
|
# fao:isInGroup)
|
138
|
+
# @return [Array<String>]
|
123
139
|
def is_in_group_url
|
124
140
|
returnGroup = []
|
125
141
|
group = query_root_node("fao:isInGroup/@rdf:resource", @@NAMESPACES)
|
@@ -133,6 +149,7 @@ module SemanticCrawler
|
|
133
149
|
|
134
150
|
# Returns all countries that share a border with this country (as
|
135
151
|
# dereferencable URL - from fao:hasBorderWith)
|
152
|
+
# @return [Array<String>]
|
136
153
|
def has_boarder_with_url
|
137
154
|
returnGroup = []
|
138
155
|
group = query_root_node("fao:hasBorderWith/@rdf:resource", @@NAMESPACES)
|
@@ -146,6 +163,7 @@ module SemanticCrawler
|
|
146
163
|
|
147
164
|
# Returns all countries that share a border with this country (as
|
148
165
|
# name)
|
166
|
+
# @return [Array<String>]
|
149
167
|
def has_boarder_with_name
|
150
168
|
returnGroup = []
|
151
169
|
group = query_root_node("fao:hasBorderWith/@rdf:resource", @@NAMESPACES)
|
@@ -158,36 +176,43 @@ module SemanticCrawler
|
|
158
176
|
end
|
159
177
|
|
160
178
|
# Population notes (from fao:populationNotes)
|
179
|
+
# @return [String]
|
161
180
|
def population_notes
|
162
181
|
query_root_node("fao:populationNotes/text()", @@NAMESPACES).to_s
|
163
182
|
end
|
164
183
|
|
165
184
|
# Population total (from fao:populationTotal)
|
185
|
+
# @return [String]
|
166
186
|
def population_total
|
167
187
|
query_root_node("fao:populationTotal/text()", @@NAMESPACES).to_s
|
168
188
|
end
|
169
189
|
|
170
190
|
# Population unit (from fao:populationUnit)
|
191
|
+
# @return [String]
|
171
192
|
def population_unit
|
172
193
|
query_root_node("fao:populationUnit/text()", @@NAMESPACES).to_s
|
173
194
|
end
|
174
195
|
|
175
196
|
# Population year (from fao:populationYear)
|
197
|
+
# @return [String]
|
176
198
|
def population_year
|
177
199
|
query_root_node("fao:populationYear/text()", @@NAMESPACES).to_s
|
178
200
|
end
|
179
201
|
|
180
202
|
# Entity is valid since (from fao:validSince)
|
203
|
+
# @return [String]
|
181
204
|
def valid_since
|
182
205
|
query_root_node("fao:validSince/text()", @@NAMESPACES).to_s
|
183
206
|
end
|
184
207
|
|
185
208
|
# Entity is valid until (from fao:validUntil)
|
209
|
+
# @return [String]
|
186
210
|
def valid_until
|
187
211
|
query_root_node("fao:validUntil/text()", @@NAMESPACES).to_s
|
188
212
|
end
|
189
213
|
|
190
214
|
# Executes an XPath query, optionally with a hash of namespaces
|
215
|
+
# @return [String]
|
191
216
|
def query_root_node(xpath_query, namespaces = {})
|
192
217
|
if !@root_node.nil?
|
193
218
|
@root_node.xpath(xpath_query, namespaces)
|
@@ -195,6 +220,7 @@ module SemanticCrawler
|
|
195
220
|
end
|
196
221
|
|
197
222
|
# Outputs the document as XML
|
223
|
+
# @return [String] The document serialized as XML
|
198
224
|
def xml_document
|
199
225
|
@root_node.to_s
|
200
226
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module SemanticCrawler
|
2
|
+
module GeoNames
|
3
|
+
# Looks up country information for a latitude/longitude pair via GeoNames.
|
4
|
+
class Country
|
5
|
+
|
6
|
+
# @attribute [r]
|
7
|
+
# The input latitude
|
8
|
+
attr_reader :latitude
|
9
|
+
|
10
|
+
# @attribute [r]
|
11
|
+
# The input longitude
|
12
|
+
attr_reader :longitude
|
13
|
+
|
14
|
+
def initialize(new_latitude, new_longitude)
|
15
|
+
@latitude = new_latitude
|
16
|
+
@longitude = new_longitude
|
17
|
+
end
|
18
|
+
|
19
|
+
# Returns an ISO 3166-1 alpha-2 country code
|
20
|
+
def get_country_code
|
21
|
+
Geonames::WebService.country_code @latitude, @longitude
|
22
|
+
end
|
23
|
+
|
24
|
+
# Returns the name of the country
|
25
|
+
def get_country_name
|
26
|
+
subdivision = Geonames::WebService.country_subdivision @latitude, @longitude
|
27
|
+
subdivision.country_name
|
28
|
+
end
|
29
|
+
|
30
|
+
# Returns nearby Wikipedia article objects, each containing a link to the article,
|
31
|
+
# summary and many more fields.
|
32
|
+
def get_wikipedia_articles
|
33
|
+
Geonames::WebService.find_nearby_wikipedia :lat => @latitude, :long => @longitude
|
34
|
+
end
|
35
|
+
|
36
|
+
# @return [SemanticCrawler::Factbook::Country] A Factbook country object
|
37
|
+
def get_factbook_country
|
38
|
+
SemanticCrawler::Factbook::Country.new get_country_name
|
39
|
+
end
|
40
|
+
|
41
|
+
# @return [SemanticCrawler::Fao::Country] A Food and Agriculture country object
|
42
|
+
def get_fao_country
|
43
|
+
SemanticCrawler::Fao::Country.new get_country_name
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -23,33 +23,39 @@ module SemanticCrawler
|
|
23
23
|
end
|
24
24
|
|
25
25
|
# geo:lat
|
26
|
+
# @return [String]
|
26
27
|
def latitude
|
27
|
-
query_root_node("geo:lat/text()", @@NAMESPACES)
|
28
|
+
query_root_node("geo:lat/text()", @@NAMESPACES).to_s
|
28
29
|
end
|
29
30
|
|
30
31
|
# geo:long
|
32
|
+
# @return [String]
|
31
33
|
def longitude
|
32
|
-
query_root_node("geo:long/text()", @@NAMESPACES)
|
34
|
+
query_root_node("geo:long/text()", @@NAMESPACES).to_s
|
33
35
|
end
|
34
36
|
|
35
37
|
# The type as URL
|
38
|
+
# @return [String]
|
36
39
|
def type
|
37
|
-
query_root_node("rdf:type/@rdf:resource", @@NAMESPACES)
|
40
|
+
query_root_node("rdf:type/@rdf:resource", @@NAMESPACES).to_s
|
38
41
|
end
|
39
42
|
|
40
43
|
# Comment about the entity
|
44
|
+
# @return [String]
|
41
45
|
def note
|
42
|
-
query_root_node("lgdp:note/text()", @@NAMESPACES)
|
46
|
+
query_root_node("lgdp:note/text()", @@NAMESPACES).to_s
|
43
47
|
end
|
44
48
|
|
45
49
|
# Data Source
|
50
|
+
# @return [String]
|
46
51
|
def created_by
|
47
|
-
query_root_node("lgdp:created_by/text()", @@NAMESPACES)
|
52
|
+
query_root_node("lgdp:created_by/text()", @@NAMESPACES).to_s
|
48
53
|
end
|
49
54
|
|
50
55
|
# User link
|
56
|
+
# @return [String]
|
51
57
|
def contributor
|
52
|
-
query_root_node("lgdo:contributor/@rdf:resource", @@NAMESPACES)
|
58
|
+
query_root_node("lgdo:contributor/@rdf:resource", @@NAMESPACES).to_s
|
53
59
|
end
|
54
60
|
|
55
61
|
# Link to way
|
@@ -58,33 +64,39 @@ module SemanticCrawler
|
|
58
64
|
end
|
59
65
|
|
60
66
|
# Street name
|
67
|
+
# @return [String]
|
61
68
|
def has_street
|
62
|
-
query_root_node("lgdo:hasStreet/text()", @@NAMESPACES)
|
69
|
+
query_root_node("lgdo:hasStreet/text()", @@NAMESPACES).to_s
|
63
70
|
end
|
64
71
|
|
65
72
|
# Postal code
|
73
|
+
# @return [String]
|
66
74
|
def has_postal_code
|
67
|
-
query_root_node("lgdo:hasPostalCode/text()", @@NAMESPACES)
|
75
|
+
query_root_node("lgdo:hasPostalCode/text()", @@NAMESPACES).to_s
|
68
76
|
end
|
69
77
|
|
70
78
|
# House number
|
79
|
+
# @return [String]
|
71
80
|
def has_house_number
|
72
|
-
query_root_node("lgdo:hasHouseNumber/text()", @@NAMESPACES)
|
81
|
+
query_root_node("lgdo:hasHouseNumber/text()", @@NAMESPACES).to_s
|
73
82
|
end
|
74
83
|
|
75
84
|
# Country in ISO 3166-1 alpha-2
|
85
|
+
# @return [String]
|
76
86
|
def has_country
|
77
|
-
query_root_node("lgdo:hasCountry/text()", @@NAMESPACES)
|
87
|
+
query_root_node("lgdo:hasCountry/text()", @@NAMESPACES).to_s
|
78
88
|
end
|
79
89
|
|
80
90
|
# City name
|
91
|
+
# @return [String]
|
81
92
|
def has_city
|
82
|
-
query_root_node("lgdo:hasCity/text()", @@NAMESPACES)
|
93
|
+
query_root_node("lgdo:hasCity/text()", @@NAMESPACES).to_s
|
83
94
|
end
|
84
95
|
|
85
96
|
# Is this place wheelchair friendly?
|
97
|
+
# @return [String]
|
86
98
|
def wheelchair
|
87
|
-
query_root_node("lgdo:wheelchair/@rdf:resource", @@NAMESPACES)
|
99
|
+
query_root_node("lgdo:wheelchair/@rdf:resource", @@NAMESPACES).to_s
|
88
100
|
end
|
89
101
|
|
90
102
|
# Query the root_node
|
@@ -95,6 +107,7 @@ module SemanticCrawler
|
|
95
107
|
end
|
96
108
|
|
97
109
|
# Outputs the document as XML
|
110
|
+
# @return [String] The document serialized as XML
|
98
111
|
def xml_document
|
99
112
|
@root_node.to_s
|
100
113
|
end
|