semantic-crawler 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +25 -0
- data/README.rdoc +44 -12
- data/Rakefile +9 -1
- data/lib/semantic_crawler.rb +26 -10
- data/lib/semantic_crawler/dbpedia.rb +4 -0
- data/lib/semantic_crawler/factbook/country.rb +13 -9
- data/lib/semantic_crawler/fao.rb +6 -0
- data/lib/semantic_crawler/fao/country.rb +198 -0
- data/lib/semantic_crawler/freebase/country.rb +43 -32
- data/lib/semantic_crawler/gdacs.rb +1 -1
- data/lib/semantic_crawler/gdacs/emergency_feed.rb +67 -0
- data/lib/semantic_crawler/gdacs/emergency_feed_item.rb +55 -0
- data/lib/semantic_crawler/gdacs/feed.rb +2 -1
- data/lib/semantic_crawler/gdacs/resource.rb +0 -4
- data/lib/semantic_crawler/geo_names.rb +4 -0
- data/lib/semantic_crawler/version.rb +1 -1
- data/lib/tasks/semantic_crawler_tasks.rake +4 -4
- data/log/semantic-crawler.log +31 -0
- data/test/dummy/log/test.log +753 -0
- data/test/semantic_crawler_test.rb +0 -19
- metadata +167 -115
- data/lib/semantic_crawler/freebase/crawler.rb +0 -22
- data/lib/semantic_crawler/freebase/entity.rb +0 -4
- data/test/factbook_test.rb +0 -86
- data/test/gdacs_test.rb +0 -80
data/CHANGELOG.rdoc
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
= Semantic Crawler CHANGELOG
|
2
|
+
|
3
|
+
== 0.0.4
|
4
|
+
|
5
|
+
* Adding module for parsing country information from "Food and Agriculture
|
6
|
+
Organization of the United Nations"
|
7
|
+
* Adding emergency feed to Gdacs module
|
8
|
+
* Adding basic module structure for all modules (not all implemented)
|
9
|
+
* Adding basic rspec structure for all modules (not all implemented)
|
10
|
+
|
11
|
+
== 0.0.3
|
12
|
+
|
13
|
+
* Implementing the Gdacs module
|
14
|
+
* Improving documentation
|
15
|
+
* Better Exception Handling
|
16
|
+
* Minor bug fixing
|
17
|
+
|
18
|
+
== 0.0.2
|
19
|
+
|
20
|
+
* Factbook::Country implemented
|
21
|
+
* First version that is released to rubygems.org for testing purposes
|
22
|
+
|
23
|
+
== 0.0.1
|
24
|
+
|
25
|
+
* Initial version that is not usable and not released.
|
data/README.rdoc
CHANGED
@@ -1,6 +1,4 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
= SemanticCrawler
|
1
|
+
= SemanticCrawler {<img src="https://secure.travis-ci.org/obale/semantic_crawler.png?branch=master"/>}[http://travis-ci.org/#!/obale/semantic_crawler]
|
4
2
|
|
5
3
|
This project encapsulates data gathering from different sources.
|
6
4
|
It simplifies the extension of internal data with publicly available
|
@@ -11,8 +9,9 @@ to bypass complex NLP (natural language processing).
|
|
11
9
|
== Supported Sources
|
12
10
|
|
13
11
|
* CIA Factbook RDF Dump, see http://www4.wiwiss.fu-berlin.de/factbook/directory/countries
|
12
|
+
* GDACS (see http://gdacs.org)
|
13
|
+
* FAO - Food and Agriculture Organization of the United Nations (see http://www.fao.org)
|
14
14
|
* [Started] LinkedGeoData - LGD (see http://linkedgeodata.org)
|
15
|
-
* [Started] GDACS (see http://gdacs.org)
|
16
15
|
* [Started] Freebase (see http://freebase.com)
|
17
16
|
|
18
17
|
=== TODO
|
@@ -36,23 +35,60 @@ Or from source:
|
|
36
35
|
|
37
36
|
You can add this library also as dependency in your Gemfile:
|
38
37
|
|
39
|
-
gem "semantic-
|
38
|
+
gem "semantic-crawler"
|
40
39
|
|
41
40
|
Or from source:
|
42
41
|
|
43
|
-
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git"
|
44
|
-
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git", :
|
42
|
+
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git" # for the master branch or
|
43
|
+
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git", :branch => "develop" # for the developer branch
|
45
44
|
|
46
45
|
|
47
46
|
== Examples
|
48
47
|
|
48
|
+
>> require "semantic_crawler"
|
49
|
+
|
50
|
+
=== Factbook
|
51
|
+
|
49
52
|
Fetch Factbook information about Austria:
|
50
53
|
|
51
|
-
>> require "semantic_crawler"
|
52
54
|
>> austria = SemanticCrawler::Factbook::Country.new("austria")
|
53
55
|
>> puts austria.background
|
54
56
|
>> puts austria.climate
|
55
57
|
|
58
|
+
=== GDACS
|
59
|
+
|
60
|
+
Parse crisis information feed from GDACS.org:
|
61
|
+
|
62
|
+
>> feed = SemanticCrawler::Gdacs::Feed.new
|
63
|
+
>> puts feed.title.to_s
|
64
|
+
>> puts feed.description.to_s
|
65
|
+
>> feed.items.each do |item|
|
66
|
+
>> puts item.title.to_s
|
67
|
+
>> puts item.eventtype.to_s
|
68
|
+
>> item.resources.each do |resource|
|
69
|
+
>> puts resource.url.to_s
|
70
|
+
>> end
|
71
|
+
>> end
|
72
|
+
|
73
|
+
Get information from the GDACS.org emergency feed:
|
74
|
+
|
75
|
+
>> emergency_feed = SemanticCrawler::Gdacs::EmergencyFeed.new
|
76
|
+
>> puts emergency_feed.title.to_s
|
77
|
+
>> items = emergency_feed.items
|
78
|
+
>> items.each do |item|
|
79
|
+
>> puts item.title.to_s
|
80
|
+
>> puts item.link.to_s
|
81
|
+
>> end
|
82
|
+
|
83
|
+
=== FAO
|
84
|
+
|
85
|
+
Country information from {FAO}[http://www.fao.org]:
|
86
|
+
|
87
|
+
>> austria = SemanticCrawler::Fao::Country.new("Austria")
|
88
|
+
>> puts austria.name_currency("en")
|
89
|
+
>> puts austria.official_name("es")
|
90
|
+
|
91
|
+
|
56
92
|
== Changelog
|
57
93
|
|
58
94
|
see CHANGELOG.rdoc
|
@@ -63,10 +99,6 @@ see CHANGELOG.rdoc
|
|
63
99
|
* Ruby 1.8.7-p358 and Rails 3.2.2
|
64
100
|
* Ruby 1.9.3-p125 and Rails 3.2.2
|
65
101
|
|
66
|
-
=== Development Branch Health
|
67
|
-
|
68
|
-
{<img src="https://secure.travis-ci.org/obale/semantic_crawler.png?branch=master"/>}[http://travis-ci.org/#!/obale/semantic_crawler]
|
69
|
-
|
70
102
|
|
71
103
|
== License
|
72
104
|
|
data/Rakefile
CHANGED
@@ -12,6 +12,9 @@ rescue LoadError
|
|
12
12
|
RDoc::Task = Rake::RDocTask
|
13
13
|
end
|
14
14
|
|
15
|
+
require 'rake'
|
16
|
+
require 'rspec/core/rake_task'
|
17
|
+
|
15
18
|
RDoc::Task.new(:rdoc) do |rdoc|
|
16
19
|
rdoc.rdoc_dir = 'rdoc'
|
17
20
|
rdoc.title = 'SemanticCrawler'
|
@@ -26,6 +29,9 @@ Bundler::GemHelper.install_tasks
|
|
26
29
|
|
27
30
|
require 'rake/testtask'
|
28
31
|
|
32
|
+
# Import own written tasks
|
33
|
+
Dir.glob('lib/tasks/*.rake').each { |r| import r }
|
34
|
+
|
29
35
|
Rake::TestTask.new(:test) do |t|
|
30
36
|
t.libs << 'lib'
|
31
37
|
t.libs << 'test'
|
@@ -33,4 +39,6 @@ Rake::TestTask.new(:test) do |t|
|
|
33
39
|
t.verbose = false
|
34
40
|
end
|
35
41
|
|
36
|
-
|
42
|
+
RSpec::Core::RakeTask.new(:spec)
|
43
|
+
|
44
|
+
task :default => [ :test, :spec ]
|
data/lib/semantic_crawler.rb
CHANGED
@@ -13,24 +13,40 @@ require "httparty"
|
|
13
13
|
#
|
14
14
|
# The existing modules are extended stepwise and additional
|
15
15
|
# sources are added in the future.
|
16
|
+
require 'logger'
|
17
|
+
|
18
|
+
# Top module that contains the whole library. Each sub-module
|
19
|
+
# wraps one source.
|
16
20
|
module SemanticCrawler
|
21
|
+
$log = Logger.new(File.expand_path('../../log/semantic-crawler.log', __FILE__), 'daily')
|
17
22
|
end
|
18
23
|
|
19
|
-
# Freebase.com - module: Freebase
|
20
|
-
require "semantic_crawler/freebase/crawler"
|
21
|
-
require "semantic_crawler/freebase/entity"
|
22
|
-
require "semantic_crawler/freebase/country"
|
23
|
-
|
24
24
|
# CIA Factbook RDF Dump - module: Factbook
|
25
25
|
require "semantic_crawler/factbook"
|
26
26
|
require "semantic_crawler/factbook/country"
|
27
27
|
|
28
|
-
#
|
29
|
-
require "semantic_crawler/linked_geo_data"
|
30
|
-
require "semantic_crawler/linked_geo_data/relevant_node"
|
31
|
-
|
32
|
-
# GDACS (http://gdacs.org) - module: Gdacs
|
28
|
+
# GDACS.org - module: Gdacs
|
33
29
|
require "semantic_crawler/gdacs"
|
34
30
|
require "semantic_crawler/gdacs/feed.rb"
|
35
31
|
require "semantic_crawler/gdacs/feed_item.rb"
|
36
32
|
require "semantic_crawler/gdacs/resource.rb"
|
33
|
+
require "semantic_crawler/gdacs/emergency_feed.rb"
|
34
|
+
require "semantic_crawler/gdacs/emergency_feed_item.rb"
|
35
|
+
|
36
|
+
# DBPedia - module: Dbpedia
|
37
|
+
require "semantic_crawler/dbpedia"
|
38
|
+
|
39
|
+
# GeoNames.org - module: GeoNames
|
40
|
+
require "semantic_crawler/geo_names"
|
41
|
+
|
42
|
+
# LinkedGeoData.org - module: LinkedGeoData
|
43
|
+
require "semantic_crawler/linked_geo_data"
|
44
|
+
require "semantic_crawler/linked_geo_data/relevant_node"
|
45
|
+
|
46
|
+
# Fao.org - module: Fao
|
47
|
+
require "semantic_crawler/fao"
|
48
|
+
require "semantic_crawler/fao/country"
|
49
|
+
|
50
|
+
# Freebase.com - module: Freebase
|
51
|
+
require "semantic_crawler/freebase/country"
|
52
|
+
|
@@ -35,16 +35,20 @@ module SemanticCrawler
|
|
35
35
|
# Arguments:
|
36
36
|
# new_country_name: (String)
|
37
37
|
def initialize(new_country_name)
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
38
|
+
if !new_country_name.nil?
|
39
|
+
@country_name = new_country_name
|
40
|
+
@url = @@URI_PREFIX + @country_name.downcase.gsub(" ", "_").gsub("usa", "united_states")
|
41
|
+
begin
|
42
|
+
fetch_rdf
|
43
|
+
rescue => e
|
44
|
+
$log.error("Not able to get country information, through exception: #{e}")
|
45
|
+
end
|
44
46
|
end
|
45
47
|
end
|
46
48
|
|
47
49
|
# Returns the country name (rdfs:label)
|
50
|
+
# XXX: If nothing was found this method returns
|
51
|
+
# <?xml version="1.0"?>
|
48
52
|
def name
|
49
53
|
get_rdfs_property("label", "/rdf:RDF/rdf:Description/factbook:landboundary/factbook:Country")
|
50
54
|
end
|
@@ -74,7 +78,7 @@ module SemanticCrawler
|
|
74
78
|
if !@doc.nil?
|
75
79
|
@doc.xpath("//factbook:landboundary/rdf:Description/@rdf:about", @@NAMESPACES)
|
76
80
|
else
|
77
|
-
|
81
|
+
nil
|
78
82
|
end
|
79
83
|
end
|
80
84
|
|
@@ -110,7 +114,7 @@ module SemanticCrawler
|
|
110
114
|
if !@doc.nil?
|
111
115
|
@doc.xpath(prefix + "/factbook:" + property_name + "/text()", @@NAMESPACES)
|
112
116
|
else
|
113
|
-
|
117
|
+
nil
|
114
118
|
end
|
115
119
|
end
|
116
120
|
|
@@ -120,7 +124,7 @@ module SemanticCrawler
|
|
120
124
|
if !@doc.nil?
|
121
125
|
@doc.xpath(prefix + "/rdfs:" + property_name + "/text()", @@NAMESPACES)
|
122
126
|
else
|
123
|
-
|
127
|
+
nil
|
124
128
|
end
|
125
129
|
end
|
126
130
|
|
@@ -0,0 +1,198 @@
|
|
1
|
+
module SemanticCrawler
|
2
|
+
module Fao
|
3
|
+
# Represents Food and Agriculture information about one country.
|
4
|
+
class Country
|
5
|
+
|
6
|
+
# Namespace hash
|
7
|
+
@@NAMESPACES = {
|
8
|
+
"rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
9
|
+
"fao" => "http://www.fao.org/countryprofiles/geoinfo/geopolitical/resource/",
|
10
|
+
"owl" => "http://www.w3.org/2002/07/owl#"
|
11
|
+
}
|
12
|
+
|
13
|
+
# @attribute [r]
|
14
|
+
# The read only country name
|
15
|
+
attr_reader :country_name
|
16
|
+
|
17
|
+
# @attribute [r]
|
18
|
+
# The read only URL to the FAO resource
|
19
|
+
attr_reader :url
|
20
|
+
|
21
|
+
# Initialize a new Fao country object
|
22
|
+
def initialize(new_country_name)
|
23
|
+
@country_name = new_country_name
|
24
|
+
@url = "http://www.fao.org/countryprofiles/geoinfo/geopolitical/data/#{@country_name}"
|
25
|
+
@root_node = nil
|
26
|
+
begin
|
27
|
+
fetch_rdf
|
28
|
+
rescue => e
|
29
|
+
$log.error("Not able to get country information, through exception: #{e}")
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
# The dbpedia identifier (from fao:codeDBPediaID)
|
34
|
+
def code_dbpedia_id
|
35
|
+
query_root_node("fao:codeDBPediaID/text()", @@NAMESPACES).to_s
|
36
|
+
end
|
37
|
+
|
38
|
+
# Links to additional information (from owl:sameAs)
|
39
|
+
def same_as
|
40
|
+
returnLinks = []
|
41
|
+
links = query_root_node("owl:sameAs/@rdf:resource", @@NAMESPACES)
|
42
|
+
links.each do |link|
|
43
|
+
returnLinks << link.to_s
|
44
|
+
end
|
45
|
+
returnLinks
|
46
|
+
end
|
47
|
+
|
48
|
+
# The type as URL of this entity (from rdf:type)
|
49
|
+
def type_url
|
50
|
+
query_root_node("rdf:type/@rdf:resource", @@NAMESPACES).to_s
|
51
|
+
end
|
52
|
+
|
53
|
+
# The maximum latitude (from fao:hasMaxLatitude)
|
54
|
+
def max_latitude
|
55
|
+
query_root_node("fao:hasMaxLatitude/text()", @@NAMESPACES).to_s
|
56
|
+
end
|
57
|
+
|
58
|
+
# The maximum longitude (from fao:hasMaxLongitude)
|
59
|
+
def max_longitude
|
60
|
+
query_root_node("fao:hasMaxLongitude/text()", @@NAMESPACES).to_s
|
61
|
+
end
|
62
|
+
|
63
|
+
# The minimum latitude (from fao:hasMinLatitude)
|
64
|
+
def min_latitude
|
65
|
+
query_root_node("fao:hasMinLatitude/text()", @@NAMESPACES).to_s
|
66
|
+
end
|
67
|
+
|
68
|
+
# The minimum longitude (from fao:hasMinLongitude)
|
69
|
+
def min_longitude
|
70
|
+
query_root_node("fao:hasMinLongitude/text()", @@NAMESPACES).to_s
|
71
|
+
end
|
72
|
+
|
73
|
+
# Human readable description about the land area (from fao:landAreaNotes)
|
74
|
+
def land_area_notes
|
75
|
+
query_root_node("fao:landAreaNotes/text()", @@NAMESPACES).to_s
|
76
|
+
end
|
77
|
+
|
78
|
+
# Land area total value (from fao:landAreaTotal)
|
79
|
+
def land_area_total
|
80
|
+
query_root_node("fao:landAreaTotal/text()", @@NAMESPACES).to_s
|
81
|
+
end
|
82
|
+
|
83
|
+
# Land area unit (from fao:landAreaUnit)
|
84
|
+
def land_area_unit
|
85
|
+
query_root_node("fao:landAreaUnit/text()", @@NAMESPACES).to_s
|
86
|
+
end
|
87
|
+
|
88
|
+
# Land area year (from fao:landAreaYear)
|
89
|
+
def land_area_year
|
90
|
+
query_root_node("fao:landAreaYear/text()", @@NAMESPACES).to_s
|
91
|
+
end
|
92
|
+
|
93
|
+
# The currency name.
|
94
|
+
# @param [Lang] The language in which the currency name should be returned
|
95
|
+
def name_currency(lang)
|
96
|
+
query_root_node("fao:nameCurrency[@xml:lang='#{lang}']/text()", @@NAMESPACES).to_s
|
97
|
+
end
|
98
|
+
|
99
|
+
# The official country name
|
100
|
+
# @param [Lang] The language in which the official name should be returned
|
101
|
+
def official_name(lang)
|
102
|
+
query_root_node("fao:nameOfficial[@xml:lang='#{lang}']/text()", @@NAMESPACES).to_s
|
103
|
+
end
|
104
|
+
|
105
|
+
# Classification of this country as name (from fao:isInGroup)
|
106
|
+
def is_in_group_name
|
107
|
+
returnGroup = []
|
108
|
+
group = query_root_node("fao:isInGroup/@rdf:resource", @@NAMESPACES)
|
109
|
+
group.each do |entry|
|
110
|
+
returnGroup << entry.to_s.split("/")[7]
|
111
|
+
end
|
112
|
+
returnGroup
|
113
|
+
end
|
114
|
+
|
115
|
+
# Classification of this country as dereferenceable URL (from
|
116
|
+
# fao:isInGroup)
|
117
|
+
def is_in_group_url
|
118
|
+
returnGroup = []
|
119
|
+
group = query_root_node("fao:isInGroup/@rdf:resource", @@NAMESPACES)
|
120
|
+
group.each do |entry|
|
121
|
+
returnGroup << entry.to_s
|
122
|
+
end
|
123
|
+
returnGroup
|
124
|
+
end
|
125
|
+
|
126
|
+
# Returns all countries that share a border with this country (as
|
127
|
+
# dereferenceable URL - from fao:hasBorderWith)
|
128
|
+
def has_boarder_with_url
|
129
|
+
returnGroup = []
|
130
|
+
group = query_root_node("fao:hasBorderWith/@rdf:resource", @@NAMESPACES)
|
131
|
+
group.each do |entry|
|
132
|
+
returnGroup << entry.to_s
|
133
|
+
end
|
134
|
+
returnGroup
|
135
|
+
end
|
136
|
+
|
137
|
+
# Returns all countries that share a border with this country (as
|
138
|
+
# name)
|
139
|
+
def has_boarder_with_name
|
140
|
+
returnGroup = []
|
141
|
+
group = query_root_node("fao:hasBorderWith/@rdf:resource", @@NAMESPACES)
|
142
|
+
group.each do |entry|
|
143
|
+
returnGroup << entry.to_s.split("/")[7]
|
144
|
+
end
|
145
|
+
returnGroup
|
146
|
+
end
|
147
|
+
|
148
|
+
# Population notes (from fao:populationNotes)
|
149
|
+
def population_notes
|
150
|
+
query_root_node("fao:populationNotes/text()", @@NAMESPACES).to_s
|
151
|
+
end
|
152
|
+
|
153
|
+
# Population total (from fao:populationTotal)
|
154
|
+
def population_total
|
155
|
+
query_root_node("fao:populationTotal/text()", @@NAMESPACES).to_s
|
156
|
+
end
|
157
|
+
|
158
|
+
# Population unit (from fao:populationUnit)
|
159
|
+
def population_unit
|
160
|
+
query_root_node("fao:populationUnit/text()", @@NAMESPACES).to_s
|
161
|
+
end
|
162
|
+
|
163
|
+
# Population year (from fao:populationYear)
|
164
|
+
def population_year
|
165
|
+
query_root_node("fao:populationYear/text()", @@NAMESPACES).to_s
|
166
|
+
end
|
167
|
+
|
168
|
+
# Entity is valid since (from fao:validSince)
|
169
|
+
def valid_since
|
170
|
+
query_root_node("fao:validSince/text()", @@NAMESPACES).to_s
|
171
|
+
end
|
172
|
+
|
173
|
+
# Entity is valid until (from fao:validUntil)
|
174
|
+
def valid_until
|
175
|
+
query_root_node("fao:validUntil/text()", @@NAMESPACES).to_s
|
176
|
+
end
|
177
|
+
|
178
|
+
# Executes an XPath query, optionally with a hash of namespaces
|
179
|
+
def query_root_node(xpath_query, namespaces = {})
|
180
|
+
if !@root_node.nil?
|
181
|
+
@root_node.xpath(xpath_query, namespaces)
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
# Outputs the document as XML
|
186
|
+
def xml_document
|
187
|
+
@root_node.to_s
|
188
|
+
end
|
189
|
+
|
190
|
+
private
|
191
|
+
# Retrieves the RDF file
|
192
|
+
def fetch_rdf
|
193
|
+
@doc = Nokogiri::XML(open(@url))
|
194
|
+
@root_node = @doc.xpath("/rdf:RDF/rdf:Description", @@NAMESPACES)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
end
|