semantic-crawler 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +25 -0
- data/README.rdoc +44 -12
- data/Rakefile +9 -1
- data/lib/semantic_crawler.rb +26 -10
- data/lib/semantic_crawler/dbpedia.rb +4 -0
- data/lib/semantic_crawler/factbook/country.rb +13 -9
- data/lib/semantic_crawler/fao.rb +6 -0
- data/lib/semantic_crawler/fao/country.rb +198 -0
- data/lib/semantic_crawler/freebase/country.rb +43 -32
- data/lib/semantic_crawler/gdacs.rb +1 -1
- data/lib/semantic_crawler/gdacs/emergency_feed.rb +67 -0
- data/lib/semantic_crawler/gdacs/emergency_feed_item.rb +55 -0
- data/lib/semantic_crawler/gdacs/feed.rb +2 -1
- data/lib/semantic_crawler/gdacs/resource.rb +0 -4
- data/lib/semantic_crawler/geo_names.rb +4 -0
- data/lib/semantic_crawler/version.rb +1 -1
- data/lib/tasks/semantic_crawler_tasks.rake +4 -4
- data/log/semantic-crawler.log +31 -0
- data/test/dummy/log/test.log +753 -0
- data/test/semantic_crawler_test.rb +0 -19
- metadata +167 -115
- data/lib/semantic_crawler/freebase/crawler.rb +0 -22
- data/lib/semantic_crawler/freebase/entity.rb +0 -4
- data/test/factbook_test.rb +0 -86
- data/test/gdacs_test.rb +0 -80
data/CHANGELOG.rdoc
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
= Semantic Crawler CHANGELOG
|
2
|
+
|
3
|
+
== 0.0.4
|
4
|
+
|
5
|
+
* Adding module for parsing country information from "Food and Agriculture
|
6
|
+
Organization of the United Nations"
|
7
|
+
* Adding emergency feed to Gdacs module
|
8
|
+
* Adding basic module structure for all modules (not all implemented)
|
9
|
+
* Adding basic rspec structure for all modules (not all implemented)
|
10
|
+
|
11
|
+
== 0.0.3
|
12
|
+
|
13
|
+
* Implementing the Gdacs module
|
14
|
+
* Improving documentation
|
15
|
+
* Better Exception Handling
|
16
|
+
* Minor bug fixing
|
17
|
+
|
18
|
+
== 0.0.2
|
19
|
+
|
20
|
+
* Factbook::Country implemented
|
21
|
+
* First version that is released to rubygems.org for testing purpose
|
22
|
+
|
23
|
+
== 0.0.1
|
24
|
+
|
25
|
+
* Initial version that is not usable and not released.
|
data/README.rdoc
CHANGED
@@ -1,6 +1,4 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
= SemanticCrawler
|
1
|
+
= SemanticCrawler {<img src="https://secure.travis-ci.org/obale/semantic_crawler.png?branch=master"/>}[http://travis-ci.org/#!/obale/semantic_crawler]
|
4
2
|
|
5
3
|
This project encapsulates data gathering from different sources.
|
6
4
|
It simplifies the extension of internal data with public available
|
@@ -11,8 +9,9 @@ to bypass complex NLP (natural language processing).
|
|
11
9
|
== Supported Sources
|
12
10
|
|
13
11
|
* CIA Factbook RDF Dump, see http://www4.wiwiss.fu-berlin.de/factbook/directory/countries
|
12
|
+
* GDACS (see http://gdacs.org)
|
13
|
+
* FAO - Food and Agriculture Organization of the United Nations (see http://www.fao.org)
|
14
14
|
* [Started] LinkedGeoData - LGD (see http://linkedgeodata.org)
|
15
|
-
* [Started] GDACS (see http://gdacs.org)
|
16
15
|
* [Started] Freebase (see http://freebase.com)
|
17
16
|
|
18
17
|
=== TODO
|
@@ -36,23 +35,60 @@ Or from source:
|
|
36
35
|
|
37
36
|
You can add this library also as dependency in your Gemfile:
|
38
37
|
|
39
|
-
gem "semantic-
|
38
|
+
gem "semantic-crawler"
|
40
39
|
|
41
40
|
Or from source:
|
42
41
|
|
43
|
-
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git"
|
44
|
-
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git", :
|
42
|
+
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git" # for the master branch or
|
43
|
+
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git", :branch => "develop" # for the developer branch
|
45
44
|
|
46
45
|
|
47
46
|
== Examples
|
48
47
|
|
48
|
+
>> require "semantic_crawler"
|
49
|
+
|
50
|
+
=== Factbook
|
51
|
+
|
49
52
|
Fetch Factbook information about Austria:
|
50
53
|
|
51
|
-
>> require "semantic_crawler"
|
52
54
|
>> austria = SemanticCrawler::Factbook::Country.new("austria")
|
53
55
|
>> puts austria.background
|
54
56
|
>> puts austria.climate
|
55
57
|
|
58
|
+
=== GDACS
|
59
|
+
|
60
|
+
Parse crisis information feed from GDACS.org:
|
61
|
+
|
62
|
+
>> feed = SemanticCrawler::Gdacs::Feed.new
|
63
|
+
>> puts feed.title.to_s
|
64
|
+
>> puts feed.description.to_s
|
65
|
+
>> feed.items.each do |item|
|
66
|
+
>> puts item.title.to_s
|
67
|
+
>> puts item.eventtype.to_s
|
68
|
+
>> item.resources.each do |resource|
|
69
|
+
>> puts resource.url.to_s
|
70
|
+
>> end
|
71
|
+
>> end
|
72
|
+
|
73
|
+
Get information from the GDACS.org emergency feed:
|
74
|
+
|
75
|
+
>> emergency_feed = SemanticCrawler::Gdacs::EmergencyFeed.new
|
76
|
+
>> puts emergency_feed.title.to_s
|
77
|
+
>> items = emergency_feed.items
|
78
|
+
>> items.each do |item|
|
79
|
+
>> puts item.title.to_s
|
80
|
+
>> puts item.link.to_s
|
81
|
+
>> end
|
82
|
+
|
83
|
+
=== FAO
|
84
|
+
|
85
|
+
Country information from {FAO}[http://www.fao.org]:
|
86
|
+
|
87
|
+
>> austria = SemanticCrawler::Fao::Country.new("Austria")
|
88
|
+
>> puts austria.name_currency("en")
|
89
|
+
>> puts austria.official_name("es")
|
90
|
+
|
91
|
+
|
56
92
|
== Changelog
|
57
93
|
|
58
94
|
see CHANGELOG.rdoc
|
@@ -63,10 +99,6 @@ see CHANGELOG.rdoc
|
|
63
99
|
* Ruby 1.8.7-p358 and Rails 3.2.2
|
64
100
|
* Ruby 1.9.3-p125 and Rails 3.2.2
|
65
101
|
|
66
|
-
=== Development Branch Health
|
67
|
-
|
68
|
-
{<img src="https://secure.travis-ci.org/obale/semantic_crawler.png?branch=master"/>}[http://travis-ci.org/#!/obale/semantic_crawler]
|
69
|
-
|
70
102
|
|
71
103
|
== License
|
72
104
|
|
data/Rakefile
CHANGED
@@ -12,6 +12,9 @@ rescue LoadError
|
|
12
12
|
RDoc::Task = Rake::RDocTask
|
13
13
|
end
|
14
14
|
|
15
|
+
require 'rake'
|
16
|
+
require 'rspec/core/rake_task'
|
17
|
+
|
15
18
|
RDoc::Task.new(:rdoc) do |rdoc|
|
16
19
|
rdoc.rdoc_dir = 'rdoc'
|
17
20
|
rdoc.title = 'SemanticCrawler'
|
@@ -26,6 +29,9 @@ Bundler::GemHelper.install_tasks
|
|
26
29
|
|
27
30
|
require 'rake/testtask'
|
28
31
|
|
32
|
+
# Import own written tasks
|
33
|
+
Dir.glob('lib/tasks/*.rake').each { |r| import r }
|
34
|
+
|
29
35
|
Rake::TestTask.new(:test) do |t|
|
30
36
|
t.libs << 'lib'
|
31
37
|
t.libs << 'test'
|
@@ -33,4 +39,6 @@ Rake::TestTask.new(:test) do |t|
|
|
33
39
|
t.verbose = false
|
34
40
|
end
|
35
41
|
|
36
|
-
|
42
|
+
RSpec::Core::RakeTask.new(:spec)
|
43
|
+
|
44
|
+
task :default => [ :test, :spec ]
|
data/lib/semantic_crawler.rb
CHANGED
@@ -13,24 +13,40 @@ require "httparty"
|
|
13
13
|
#
|
14
14
|
# The existing modules are extended stepwise and additional
|
15
15
|
# sources are added in the future.
|
16
|
+
require 'logger'
|
17
|
+
|
18
|
+
# Top module that contains the whole library. Each sub-module
|
19
|
+
# wraps one source.
|
16
20
|
module SemanticCrawler
|
21
|
+
$log = Logger.new(File.expand_path('../../log/semantic-crawler.log', __FILE__), 'daily')
|
17
22
|
end
|
18
23
|
|
19
|
-
# Freebase.com - module: Freebase
|
20
|
-
require "semantic_crawler/freebase/crawler"
|
21
|
-
require "semantic_crawler/freebase/entity"
|
22
|
-
require "semantic_crawler/freebase/country"
|
23
|
-
|
24
24
|
# CIA Factbook RDF Dump - module: Factbook
|
25
25
|
require "semantic_crawler/factbook"
|
26
26
|
require "semantic_crawler/factbook/country"
|
27
27
|
|
28
|
-
#
|
29
|
-
require "semantic_crawler/linked_geo_data"
|
30
|
-
require "semantic_crawler/linked_geo_data/relevant_node"
|
31
|
-
|
32
|
-
# GDACS (http://gdacs.org) - module: Gdacs
|
28
|
+
# GDACS.org - module: Gdacs
|
33
29
|
require "semantic_crawler/gdacs"
|
34
30
|
require "semantic_crawler/gdacs/feed.rb"
|
35
31
|
require "semantic_crawler/gdacs/feed_item.rb"
|
36
32
|
require "semantic_crawler/gdacs/resource.rb"
|
33
|
+
require "semantic_crawler/gdacs/emergency_feed.rb"
|
34
|
+
require "semantic_crawler/gdacs/emergency_feed_item.rb"
|
35
|
+
|
36
|
+
# DBPedia - module: Dbpedia
|
37
|
+
require "semantic_crawler/dbpedia"
|
38
|
+
|
39
|
+
# GeoNames.org - module: GeoNames
|
40
|
+
require "semantic_crawler/geo_names"
|
41
|
+
|
42
|
+
# LinkedGeoData.org - module: LinkedGeoData
|
43
|
+
require "semantic_crawler/linked_geo_data"
|
44
|
+
require "semantic_crawler/linked_geo_data/relevant_node"
|
45
|
+
|
46
|
+
# Fao.org - module: Fao
|
47
|
+
require "semantic_crawler/fao"
|
48
|
+
require "semantic_crawler/fao/country"
|
49
|
+
|
50
|
+
# Freebase.com - module: Freebase
|
51
|
+
require "semantic_crawler/freebase/country"
|
52
|
+
|
@@ -35,16 +35,20 @@ module SemanticCrawler
|
|
35
35
|
# Arguments:
|
36
36
|
# new_country_name: (String)
|
37
37
|
def initialize(new_country_name)
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
38
|
+
if !new_country_name.nil?
|
39
|
+
@country_name = new_country_name
|
40
|
+
@url = @@URI_PREFIX + @country_name.downcase.gsub(" ", "_").gsub("usa", "united_states")
|
41
|
+
begin
|
42
|
+
fetch_rdf
|
43
|
+
rescue => e
|
44
|
+
$log.error("Not able to get country information, through exception: #{e}")
|
45
|
+
end
|
44
46
|
end
|
45
47
|
end
|
46
48
|
|
47
49
|
# Returns the country name (rdfs:label)
|
50
|
+
# XXX: If nothing was found this method returns
|
51
|
+
# <?xml version="1.0"?>
|
48
52
|
def name
|
49
53
|
get_rdfs_property("label", "/rdf:RDF/rdf:Description/factbook:landboundary/factbook:Country")
|
50
54
|
end
|
@@ -74,7 +78,7 @@ module SemanticCrawler
|
|
74
78
|
if !@doc.nil?
|
75
79
|
@doc.xpath("//factbook:landboundary/rdf:Description/@rdf:about", @@NAMESPACES)
|
76
80
|
else
|
77
|
-
|
81
|
+
nil
|
78
82
|
end
|
79
83
|
end
|
80
84
|
|
@@ -110,7 +114,7 @@ module SemanticCrawler
|
|
110
114
|
if !@doc.nil?
|
111
115
|
@doc.xpath(prefix + "/factbook:" + property_name + "/text()", @@NAMESPACES)
|
112
116
|
else
|
113
|
-
|
117
|
+
nil
|
114
118
|
end
|
115
119
|
end
|
116
120
|
|
@@ -120,7 +124,7 @@ module SemanticCrawler
|
|
120
124
|
if !@doc.nil?
|
121
125
|
@doc.xpath(prefix + "/rdfs:" + property_name + "/text()", @@NAMESPACES)
|
122
126
|
else
|
123
|
-
|
127
|
+
nil
|
124
128
|
end
|
125
129
|
end
|
126
130
|
|
@@ -0,0 +1,198 @@
|
|
1
|
+
module SemanticCrawler
|
2
|
+
module Fao
|
3
|
+
# Represents Food and Agriculture information about one country.
|
4
|
+
class Country
|
5
|
+
|
6
|
+
# Namespace hash
|
7
|
+
@@NAMESPACES = {
|
8
|
+
"rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
9
|
+
"fao" => "http://www.fao.org/countryprofiles/geoinfo/geopolitical/resource/",
|
10
|
+
"owl" => "http://www.w3.org/2002/07/owl#"
|
11
|
+
}
|
12
|
+
|
13
|
+
# @attribute [r]
|
14
|
+
# The read only country name
|
15
|
+
attr_reader :country_name
|
16
|
+
|
17
|
+
# @attribute [r]
|
18
|
+
# The read only URL to the FAO resource
|
19
|
+
attr_reader :url
|
20
|
+
|
21
|
+
# Initialize a new Fao country object
|
22
|
+
def initialize(new_country_name)
|
23
|
+
@country_name = new_country_name
|
24
|
+
@url = "http://www.fao.org/countryprofiles/geoinfo/geopolitical/data/#{@country_name}"
|
25
|
+
@root_node = nil
|
26
|
+
begin
|
27
|
+
fetch_rdf
|
28
|
+
rescue => e
|
29
|
+
$log.error("Not able to get country information, through exception: #{e}")
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
# The dbpedia identifier (from fao:codeDBPediaID)
|
34
|
+
def code_dbpedia_id
|
35
|
+
query_root_node("fao:codeDBPediaID/text()", @@NAMESPACES).to_s
|
36
|
+
end
|
37
|
+
|
38
|
+
# Links to additional information (from owl:sameAs)
|
39
|
+
def same_as
|
40
|
+
returnLinks = []
|
41
|
+
links = query_root_node("owl:sameAs/@rdf:resource", @@NAMESPACES)
|
42
|
+
links.each do |link|
|
43
|
+
returnLinks << link.to_s
|
44
|
+
end
|
45
|
+
returnLinks
|
46
|
+
end
|
47
|
+
|
48
|
+
# The type as URL of this entity (from rdf:type)
|
49
|
+
def type_url
|
50
|
+
query_root_node("rdf:type/@rdf:resource", @@NAMESPACES).to_s
|
51
|
+
end
|
52
|
+
|
53
|
+
# The maximum latitude (from fao:hasMaxLatitude)
|
54
|
+
def max_latitude
|
55
|
+
query_root_node("fao:hasMaxLatitude/text()", @@NAMESPACES).to_s
|
56
|
+
end
|
57
|
+
|
58
|
+
# The maximum longitude (from fao:hasMaxLongitude)
|
59
|
+
def max_longitude
|
60
|
+
query_root_node("fao:hasMaxLongitude/text()", @@NAMESPACES).to_s
|
61
|
+
end
|
62
|
+
|
63
|
+
# The minimum latitude (from fao:hasMinLatitude)
|
64
|
+
def min_latitude
|
65
|
+
query_root_node("fao:hasMinLatitude/text()", @@NAMESPACES).to_s
|
66
|
+
end
|
67
|
+
|
68
|
+
# The minimum longitude (from fao:hasMinLongitude)
|
69
|
+
def min_longitude
|
70
|
+
query_root_node("fao:hasMinLongitude/text()", @@NAMESPACES).to_s
|
71
|
+
end
|
72
|
+
|
73
|
+
# Human readable description about the land area (from fao:landAreaNotes)
|
74
|
+
def land_area_notes
|
75
|
+
query_root_node("fao:landAreaNotes/text()", @@NAMESPACES).to_s
|
76
|
+
end
|
77
|
+
|
78
|
+
# Land area total value (from fao:landAreaTotal)
|
79
|
+
def land_area_total
|
80
|
+
query_root_node("fao:landAreaTotal/text()", @@NAMESPACES).to_s
|
81
|
+
end
|
82
|
+
|
83
|
+
# Land area unit (from fao:landAreaUnit)
|
84
|
+
def land_area_unit
|
85
|
+
query_root_node("fao:landAreaUnit/text()", @@NAMESPACES).to_s
|
86
|
+
end
|
87
|
+
|
88
|
+
# Land area year (from fao:landAreaYear)
|
89
|
+
def land_area_year
|
90
|
+
query_root_node("fao:landAreaYear/text()", @@NAMESPACES).to_s
|
91
|
+
end
|
92
|
+
|
93
|
+
# The currency name.
|
94
|
+
# @param [Lang] The language in which the currency name should be returned
|
95
|
+
def name_currency(lang)
|
96
|
+
query_root_node("fao:nameCurrency[@xml:lang='#{lang}']/text()", @@NAMESPACES).to_s
|
97
|
+
end
|
98
|
+
|
99
|
+
# The official country name
|
100
|
+
# @param [Lang] The language in which the official name should be returned
|
101
|
+
def official_name(lang)
|
102
|
+
query_root_node("fao:nameOfficial[@xml:lang='#{lang}']/text()", @@NAMESPACES).to_s
|
103
|
+
end
|
104
|
+
|
105
|
+
# Classification of this country as name (from fao:isInGroup)
|
106
|
+
def is_in_group_name
|
107
|
+
returnGroup = []
|
108
|
+
group = query_root_node("fao:isInGroup/@rdf:resource", @@NAMESPACES)
|
109
|
+
group.each do |entry|
|
110
|
+
returnGroup << entry.to_s.split("/")[7]
|
111
|
+
end
|
112
|
+
returnGroup
|
113
|
+
end
|
114
|
+
|
115
|
+
# Classification of this country as dereferenceable URL (from
|
116
|
+
# fao:isInGroup)
|
117
|
+
def is_in_group_url
|
118
|
+
returnGroup = []
|
119
|
+
group = query_root_node("fao:isInGroup/@rdf:resource", @@NAMESPACES)
|
120
|
+
group.each do |entry|
|
121
|
+
returnGroup << entry.to_s
|
122
|
+
end
|
123
|
+
returnGroup
|
124
|
+
end
|
125
|
+
|
126
|
+
# Returns all countries that share a border with this country (as
|
127
|
+
# dereferenceable URL - from fao:hasBorderWith)
|
128
|
+
def has_boarder_with_url
|
129
|
+
returnGroup = []
|
130
|
+
group = query_root_node("fao:hasBorderWith/@rdf:resource", @@NAMESPACES)
|
131
|
+
group.each do |entry|
|
132
|
+
returnGroup << entry.to_s
|
133
|
+
end
|
134
|
+
returnGroup
|
135
|
+
end
|
136
|
+
|
137
|
+
# Returns all countries that share a border with this country (as
|
138
|
+
# name)
|
139
|
+
def has_boarder_with_name
|
140
|
+
returnGroup = []
|
141
|
+
group = query_root_node("fao:hasBorderWith/@rdf:resource", @@NAMESPACES)
|
142
|
+
group.each do |entry|
|
143
|
+
returnGroup << entry.to_s.split("/")[7]
|
144
|
+
end
|
145
|
+
returnGroup
|
146
|
+
end
|
147
|
+
|
148
|
+
# Population notes (from fao:populationNotes)
|
149
|
+
def population_notes
|
150
|
+
query_root_node("fao:populationNotes/text()", @@NAMESPACES).to_s
|
151
|
+
end
|
152
|
+
|
153
|
+
# Population total (from fao:populationTotal)
|
154
|
+
def population_total
|
155
|
+
query_root_node("fao:populationTotal/text()", @@NAMESPACES).to_s
|
156
|
+
end
|
157
|
+
|
158
|
+
# Population unit (from fao:populationUnit)
|
159
|
+
def population_unit
|
160
|
+
query_root_node("fao:populationUnit/text()", @@NAMESPACES).to_s
|
161
|
+
end
|
162
|
+
|
163
|
+
# Population year (from fao:populationYear)
|
164
|
+
def population_year
|
165
|
+
query_root_node("fao:populationYear/text()", @@NAMESPACES).to_s
|
166
|
+
end
|
167
|
+
|
168
|
+
# Entity is valid since (from fao:validSince)
|
169
|
+
def valid_since
|
170
|
+
query_root_node("fao:validSince/text()", @@NAMESPACES).to_s
|
171
|
+
end
|
172
|
+
|
173
|
+
# Entity is valid until (from fao:validUntil)
|
174
|
+
def valid_until
|
175
|
+
query_root_node("fao:validUntil/text()", @@NAMESPACES).to_s
|
176
|
+
end
|
177
|
+
|
178
|
+
# Executes an XPath query, optionally with a hash of namespaces
|
179
|
+
def query_root_node(xpath_query, namespaces = {})
|
180
|
+
if !@root_node.nil?
|
181
|
+
@root_node.xpath(xpath_query, namespaces)
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
# Outputs the document as XML
|
186
|
+
def xml_document
|
187
|
+
@root_node.to_s
|
188
|
+
end
|
189
|
+
|
190
|
+
private
|
191
|
+
# Retrieves the RDF file
|
192
|
+
def fetch_rdf
|
193
|
+
@doc = Nokogiri::XML(open(@url))
|
194
|
+
@root_node = @doc.xpath("/rdf:RDF/rdf:Description", @@NAMESPACES)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
end
|