semantic-crawler 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +17 -6
- data/Rakefile +2 -4
- data/lib/semantic_crawler.rb +25 -2
- data/lib/semantic_crawler/factbook.rb +11 -0
- data/lib/semantic_crawler/factbook/country.rb +15 -5
- data/lib/semantic_crawler/freebase.rb +4 -0
- data/lib/semantic_crawler/freebase/country.rb +5 -0
- data/lib/semantic_crawler/freebase/crawler.rb +1 -0
- data/lib/semantic_crawler/freebase/entity.rb +1 -0
- data/lib/semantic_crawler/gdacs.rb +6 -0
- data/lib/semantic_crawler/gdacs/feed.rb +100 -0
- data/lib/semantic_crawler/gdacs/feed_item.rb +172 -0
- data/lib/semantic_crawler/gdacs/resource.rb +66 -0
- data/lib/semantic_crawler/linked_geo_data.rb +5 -0
- data/lib/semantic_crawler/linked_geo_data/relevant_node.rb +13 -0
- data/lib/semantic_crawler/version.rb +3 -1
- data/test/dummy/log/test.log +4802 -0
- data/test/factbook_test.rb +1 -1
- data/test/gdacs_test.rb +80 -0
- metadata +37 -27
data/README.rdoc
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
[ATTENTION] This library is under heavy development
|
2
|
-
for a usable version.
|
1
|
+
[ATTENTION] This library is under heavy development. Please be patient for a usable version.
|
3
2
|
|
4
3
|
= SemanticCrawler
|
5
4
|
|
@@ -12,27 +11,35 @@ to bypass complex NLP (natural language processing).
|
|
12
11
|
== Supported Sources
|
13
12
|
|
14
13
|
* CIA Factbook RDF Dump, see http://www4.wiwiss.fu-berlin.de/factbook/directory/countries
|
14
|
+
* [Started] LinkedGeoData - LGD (see http://linkedgeodata.org)
|
15
|
+
* [Started] GDACS (see http://gdacs.org)
|
16
|
+
* [Started] Freebase (see http://freebase.com)
|
15
17
|
|
16
18
|
=== TODO
|
17
19
|
|
18
|
-
* LinkedGeoData - LGD(see http://linkedgeodata.org)
|
19
|
-
* Freebase
|
20
20
|
* Geonames
|
21
21
|
* DBPedia
|
22
22
|
* Different Government Sources
|
23
23
|
|
24
|
+
|
24
25
|
== Installation
|
25
26
|
|
27
|
+
$ gem install semantic-crawler
|
28
|
+
|
29
|
+
Or from source:
|
30
|
+
|
26
31
|
$ git clone git://github.com/obale/semantic_crawler.git
|
27
32
|
$ cd semantic_crawler
|
28
33
|
$ bundle install
|
29
34
|
$ rake build
|
30
35
|
$ rake install pkg/semantic-crawler-*.gem
|
31
36
|
|
32
|
-
If the library is stable enough it will be published at rubygems.org. That simplifies the installation.
|
33
|
-
|
34
37
|
You can add this library also as dependency in your Gemfile:
|
35
38
|
|
39
|
+
gem "semantic-crawler"
|
40
|
+
|
41
|
+
Or from source:
|
42
|
+
|
36
43
|
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git" # for the master branch or
|
37
44
|
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git", :tag => "xyz" # for the xyz tag
|
38
45
|
|
@@ -56,6 +63,10 @@ see CHANGELOG.rdoc
|
|
56
63
|
* Ruby 1.8.7-p358 and Rails 3.2.2
|
57
64
|
* Ruby 1.9.3-p125 and Rails 3.2.2
|
58
65
|
|
66
|
+
=== Development Branch Health
|
67
|
+
|
68
|
+
{<img src="https://secure.travis-ci.org/obale/semantic_crawler.png?branch=master"/>}[http://travis-ci.org/#!/obale/semantic_crawler]
|
69
|
+
|
59
70
|
|
60
71
|
== License
|
61
72
|
|
data/Rakefile
CHANGED
@@ -16,13 +16,12 @@ RDoc::Task.new(:rdoc) do |rdoc|
|
|
16
16
|
rdoc.rdoc_dir = 'rdoc'
|
17
17
|
rdoc.title = 'SemanticCrawler'
|
18
18
|
rdoc.options << '--line-numbers'
|
19
|
+
rdoc.options << '--main=README.rdoc'
|
19
20
|
rdoc.rdoc_files.include('README.rdoc')
|
21
|
+
rdoc.rdoc_files.include('CHANGELOG.rdoc')
|
20
22
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
21
23
|
end
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
25
|
Bundler::GemHelper.install_tasks
|
27
26
|
|
28
27
|
require 'rake/testtask'
|
@@ -34,5 +33,4 @@ Rake::TestTask.new(:test) do |t|
|
|
34
33
|
t.verbose = false
|
35
34
|
end
|
36
35
|
|
37
|
-
|
38
36
|
task :default => :test
|
data/lib/semantic_crawler.rb
CHANGED
@@ -1,13 +1,36 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require "httparty"
|
2
4
|
|
5
|
+
# The top level module contains the different data sources
|
6
|
+
# as sub-modules. Currently there are the following modules
|
7
|
+
# available:
|
8
|
+
#
|
9
|
+
# * Freebase
|
10
|
+
# * Factbook
|
11
|
+
# * LinkedGeoData
|
12
|
+
# * Gdacs
|
13
|
+
#
|
14
|
+
# The existing modules are extended stepwise and additional
|
15
|
+
# sources are added in the future.
|
3
16
|
module SemanticCrawler
|
4
17
|
end
|
5
18
|
|
6
|
-
# Freebase.com
|
19
|
+
# Freebase.com - module: Freebase
|
7
20
|
require "semantic_crawler/freebase/crawler"
|
8
21
|
require "semantic_crawler/freebase/entity"
|
9
22
|
require "semantic_crawler/freebase/country"
|
10
23
|
|
11
|
-
# CIA Factbook RDF Dump
|
24
|
+
# CIA Factbook RDF Dump - module: Factbook
|
25
|
+
require "semantic_crawler/factbook"
|
12
26
|
require "semantic_crawler/factbook/country"
|
13
27
|
|
28
|
+
# LinkedGeoData (http://linkedgeodata.org) - module: LinkedGeoData
|
29
|
+
require "semantic_crawler/linked_geo_data"
|
30
|
+
require "semantic_crawler/linked_geo_data/relevant_node"
|
31
|
+
|
32
|
+
# GDACS (http://gdacs.org) - module: Gdacs
|
33
|
+
require "semantic_crawler/gdacs"
|
34
|
+
require "semantic_crawler/gdacs/feed.rb"
|
35
|
+
require "semantic_crawler/gdacs/feed_item.rb"
|
36
|
+
require "semantic_crawler/gdacs/resource.rb"
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# The RDF Dump of the CIA Factbook contains country relevant information. The
|
2
|
+
# information may be outdated, but for general-purpose information that
|
3
|
+
# never or rarely changes, this source is perfect.
|
4
|
+
#
|
5
|
+
# This module encapsulates the access to the underlying RDF files and wraps
|
6
|
+
# the most important properties. Not wrapped properties in the namespace of
|
7
|
+
# factbook could be accessed via the following method:
|
8
|
+
#
|
9
|
+
# * link:SemanticCrawler::Factbook::Country::get_factbook_property
|
10
|
+
module SemanticCrawler::Factbook
|
11
|
+
end
|
@@ -1,18 +1,28 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'open-uri'
|
3
5
|
|
4
6
|
module SemanticCrawler
|
5
7
|
module Factbook
|
8
|
+
# Extracted from the RDF Dump of the CIA Factbook. Contains all
|
9
|
+
# relevant, but possibly outdated, information about countries.
|
6
10
|
class Country
|
11
|
+
# The prefix used for each country
|
7
12
|
@@URI_PREFIX = "http://www4.wiwiss.fu-berlin.de/factbook/data/"
|
8
13
|
|
14
|
+
# Predefined RDFS/OWL namespaces used for RDF file parsing.
|
9
15
|
@@NAMESPACES = {
|
10
16
|
"factbook" => "http://www4.wiwiss.fu-berlin.de/factbook/ns#",
|
11
17
|
"rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
|
12
18
|
"rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
13
19
|
}
|
14
20
|
|
21
|
+
# Country name given as input during the object creation.
|
15
22
|
attr_reader :country_name
|
23
|
+
|
24
|
+
# The complete URL of the country. May be wrong
|
25
|
+
# if the country_name is not valid.
|
16
26
|
attr_reader :url
|
17
27
|
|
18
28
|
# Get Country Information from the CIA Factbook. see
|
@@ -23,15 +33,14 @@ module SemanticCrawler
|
|
23
33
|
# >> puts austria.background
|
24
34
|
#
|
25
35
|
# Arguments:
|
26
|
-
#
|
36
|
+
# new_country_name: (String)
|
27
37
|
def initialize(new_country_name)
|
28
|
-
|
29
|
-
@
|
30
|
-
@url = @@URI_PREFIX + @country_name
|
38
|
+
@country_name = new_country_name
|
39
|
+
@url = @@URI_PREFIX + @country_name.downcase.gsub(" ", "_").gsub("usa", "united_states")
|
31
40
|
begin
|
32
41
|
fetch_rdf
|
33
42
|
rescue => e
|
34
|
-
puts "Not able to get country information, through exception: " + e
|
43
|
+
puts "Not able to get country information, through exception: " + e.message
|
35
44
|
end
|
36
45
|
end
|
37
46
|
|
@@ -116,6 +125,7 @@ module SemanticCrawler
|
|
116
125
|
end
|
117
126
|
|
118
127
|
private
|
128
|
+
# Retrieves the RDF stream
|
119
129
|
def fetch_rdf
|
120
130
|
@doc = Nokogiri::XML(open(@url))
|
121
131
|
end
|
@@ -1,9 +1,14 @@
|
|
1
|
+
# [XXX] The current implementation outputs only an unreadable JSON object.
|
1
2
|
class SemanticCrawler::Freebase::Country < SemanticCrawler::Freebase::Entity
|
3
|
+
# The Freebase object that should be retrieved
|
2
4
|
attr_accessor :input_name
|
5
|
+
# The URL that points to the JSON object.
|
3
6
|
attr_accessor :json_link
|
4
7
|
|
8
|
+
# The URL prefix of a Freebase experimental JSON entity.
|
5
9
|
@@uri_prefix = "http://www.freebase.com/experimental/topic/standard/en/"
|
6
10
|
|
11
|
+
# Creates a new Freebase object (JSON)
|
7
12
|
def initialize
|
8
13
|
@input_name = nil
|
9
14
|
@html_link = nil
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module SemanticCrawler
|
4
|
+
module Gdacs
|
5
|
+
# The GDACS.org RSS feed contains the latest crisis information about
|
6
|
+
# the following crisis types:
|
7
|
+
#
|
8
|
+
# * Floods
|
9
|
+
# * Earthquakes
|
10
|
+
# * Tropical Cyclones
|
11
|
+
# * Volcanoes
|
12
|
+
class Feed
|
13
|
+
|
14
|
+
@@NAMESPACES = {
|
15
|
+
"atom" => "http://www.w3.org/2005/Atom"
|
16
|
+
}
|
17
|
+
|
18
|
+
# The gdacs.org RSS feed URL. (default:
|
19
|
+
# http://new.gdacs.org/xml/rss.xml)
|
20
|
+
attr_reader :url
|
21
|
+
|
22
|
+
# Initializes the gdacs.org feed URL. If not specified the default
|
23
|
+
# URL (http://new.gdacs.org/xml/rss.xml) is used. Normally the
|
24
|
+
# feed URL should not be changed.
|
25
|
+
def initialize(new_url = "http://new.gdacs.org/xml/rss.xml")
|
26
|
+
@url = new_url
|
27
|
+
@root_node = nil
|
28
|
+
begin
|
29
|
+
fetch_feed
|
30
|
+
rescue => e
|
31
|
+
puts "Not able to get country information, through exception: " + e.message
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Get rss/channel/title
|
36
|
+
def title
|
37
|
+
query_root_node("title/text()")
|
38
|
+
end
|
39
|
+
|
40
|
+
# Get rss/channel/link
|
41
|
+
def link
|
42
|
+
query_root_node("link/text()")
|
43
|
+
end
|
44
|
+
|
45
|
+
# Get rss/channel/description
|
46
|
+
def description
|
47
|
+
query_root_node("description/text()")
|
48
|
+
end
|
49
|
+
|
50
|
+
# Get rss/channel/pubDate
|
51
|
+
def pubDate
|
52
|
+
query_root_node("pubDate/text()")
|
53
|
+
end
|
54
|
+
|
55
|
+
# Get rss/channel/webMaster
|
56
|
+
def webMaster
|
57
|
+
query_root_node("webMaster/text()")
|
58
|
+
end
|
59
|
+
|
60
|
+
# Get rss/channel/managingEditor
|
61
|
+
def managingEditor
|
62
|
+
query_root_node("managingEditor/text()")
|
63
|
+
end
|
64
|
+
|
65
|
+
# Get rss/channel/atom:link
|
66
|
+
def atom_link
|
67
|
+
query_root_node("atom:link/@href", @@NAMESPACES)
|
68
|
+
end
|
69
|
+
|
70
|
+
# Get rss/channel/item*
|
71
|
+
def items
|
72
|
+
nodeset = query_root_node("item")
|
73
|
+
@items = []
|
74
|
+
nodeset.each do |item|
|
75
|
+
item_obj = SemanticCrawler::Gdacs::FeedItem.new(item)
|
76
|
+
@items << item_obj
|
77
|
+
end
|
78
|
+
@items
|
79
|
+
end
|
80
|
+
|
81
|
+
# Query the root_node
|
82
|
+
def query_root_node(xpath_query, namespaces = {})
|
83
|
+
if !@root_node.nil?
|
84
|
+
@root_node.xpath(xpath_query, namespaces)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def xml_document
|
89
|
+
@root_node.to_s
|
90
|
+
end
|
91
|
+
|
92
|
+
private
|
93
|
+
# Retrieves the RSS feed
|
94
|
+
def fetch_feed
|
95
|
+
@doc = Nokogiri::XML(open(@url))
|
96
|
+
@root_node = @doc.xpath("/rss/channel")
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
@@ -0,0 +1,172 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module SemanticCrawler
|
4
|
+
module Gdacs
|
5
|
+
# One crisis entity with related resources. Could be one of the
|
6
|
+
# following crisis types:
|
7
|
+
#
|
8
|
+
# * Floods
|
9
|
+
# * Earthquakes
|
10
|
+
# * Tropical Cyclones
|
11
|
+
# * Volcanoes
|
12
|
+
class FeedItem
|
13
|
+
# XML namespaces used for the parsing process
|
14
|
+
@@NAMESPACES = {
|
15
|
+
"gdacs" => "http://www.gdacs.org",
|
16
|
+
"asgard" => "http://asgard.jrc.it",
|
17
|
+
"geo" => "http://www.w3.org/2003/01/geo/wgs84_pos#",
|
18
|
+
"dc" => "http://purl.org/dc/elements/1.1/"
|
19
|
+
}
|
20
|
+
|
21
|
+
def initialize(new_root_node)
|
22
|
+
@root_node = new_root_node
|
23
|
+
end
|
24
|
+
|
25
|
+
# Returns the crisis title
|
26
|
+
def title
|
27
|
+
query_root_node("title/text()")
|
28
|
+
end
|
29
|
+
|
30
|
+
# Returns the crisis description
|
31
|
+
def description
|
32
|
+
query_root_node("description/text()")
|
33
|
+
end
|
34
|
+
|
35
|
+
# Returns the enclosure URL
|
36
|
+
def enclosure_url
|
37
|
+
query_root_node("enclosure/@url")
|
38
|
+
end
|
39
|
+
|
40
|
+
# Returns the enclosure type, e.g. image/png
|
41
|
+
def enclosure_type
|
42
|
+
query_root_node("enclosure/@type")
|
43
|
+
end
|
44
|
+
|
45
|
+
# Returns the enclosure length, e.g. 1
|
46
|
+
def enclosure_length
|
47
|
+
query_root_node("enclosure/@length")
|
48
|
+
end
|
49
|
+
|
50
|
+
# Returns the crisis gdacs link
|
51
|
+
def link
|
52
|
+
query_root_node("link/text()")
|
53
|
+
end
|
54
|
+
|
55
|
+
# Returns the publication date
|
56
|
+
def pubDate
|
57
|
+
query_root_node("pubDate/text()")
|
58
|
+
end
|
59
|
+
|
60
|
+
# Returns the crisis start date
|
61
|
+
def fromdate
|
62
|
+
query_root_node("gdacs:fromdate/text()", @@NAMESPACES)
|
63
|
+
end
|
64
|
+
|
65
|
+
# Returns the crisis end date
|
66
|
+
def todate
|
67
|
+
query_root_node("gdacs:todate/text()", @@NAMESPACES)
|
68
|
+
end
|
69
|
+
|
70
|
+
# Returns the crisis subject abbreviation
|
71
|
+
def subject
|
72
|
+
query_root_node("dc:subject/text()", @@NAMESPACES)
|
73
|
+
end
|
74
|
+
|
75
|
+
# Returns a unique crisis identifier (could be non permanent)
|
76
|
+
def guid
|
77
|
+
query_root_node("guid/text()")
|
78
|
+
end
|
79
|
+
|
80
|
+
# Returns the latitude GPS coordinate where the crisis has occurred
|
81
|
+
def latitude
|
82
|
+
query_root_node("geo:Point/geo:lat/text()", @@NAMESPACES)
|
83
|
+
end
|
84
|
+
|
85
|
+
# Returns the longitude GPS coordinate where the crisis has occurred
|
86
|
+
def longitude
|
87
|
+
query_root_node("geo:Point/geo:long/text()", @@NAMESPACES)
|
88
|
+
end
|
89
|
+
|
90
|
+
# Returns the version
|
91
|
+
def version
|
92
|
+
query_root_node("gdacs:version/text()", @@NAMESPACES)
|
93
|
+
end
|
94
|
+
|
95
|
+
# Returns the event type abbreviation, e.g. VO (for volcanic
|
96
|
+
# eruption), EQ (for earthquake), FL (for flood), TC (for tropical
|
97
|
+
# cyclone)
|
98
|
+
def eventtype
|
99
|
+
query_root_node("gdacs:eventtype/text()", @@NAMESPACES)
|
100
|
+
end
|
101
|
+
|
102
|
+
# Returns the alert level, could be GREEN, ORANGE or RED
|
103
|
+
def alertlevel
|
104
|
+
query_root_node("gdacs:alertlevel/text()", @@NAMESPACES)
|
105
|
+
end
|
106
|
+
|
107
|
+
# Returns the event name if available
|
108
|
+
def eventname
|
109
|
+
query_root_node("gdacs:eventname/text()", @@NAMESPACES)
|
110
|
+
end
|
111
|
+
|
112
|
+
# Returns the event id
|
113
|
+
def eventid
|
114
|
+
query_root_node("gdacs:eventid/text()", @@NAMESPACES)
|
115
|
+
end
|
116
|
+
|
117
|
+
# Returns the episode id
|
118
|
+
def episodeid
|
119
|
+
query_root_node("gdacs:episodeid/text()", @@NAMESPACES)
|
120
|
+
end
|
121
|
+
|
122
|
+
# Returns the severity as human readable string
|
123
|
+
def severity
|
124
|
+
query_root_node("gdacs:severity/text()", @@NAMESPACES)
|
125
|
+
end
|
126
|
+
|
127
|
+
# Returns the population as human readable string
|
128
|
+
def population
|
129
|
+
query_root_node("gdacs:population/text()", @@NAMESPACES)
|
130
|
+
end
|
131
|
+
|
132
|
+
# Returns the vulnerability as human readable string
|
133
|
+
def vulnerability
|
134
|
+
query_root_node("gdacs:vulnerability/text()", @@NAMESPACES)
|
135
|
+
end
|
136
|
+
|
137
|
+
# Returns the country iso3 code if available
|
138
|
+
def iso3
|
139
|
+
query_root_node("gdacs:iso3/text()", @@NAMESPACES)
|
140
|
+
end
|
141
|
+
|
142
|
+
# Returns the country name(s)
|
143
|
+
def country
|
144
|
+
query_root_node("gdacs:country/text()", @@NAMESPACES)
|
145
|
+
end
|
146
|
+
|
147
|
+
# Returns the gdacs:glide value (presumably the GLIDE disaster identifier number - TODO confirm)
|
148
|
+
def glide
|
149
|
+
query_root_node("gdacs:glide/text()", @@NAMESPACES)
|
150
|
+
end
|
151
|
+
|
152
|
+
# Returns an array of SemanticCrawler::Gdacs::Resource objects
|
153
|
+
def resources
|
154
|
+
nodeset = query_root_node("gdacs:resources/gdacs:resource", @@NAMESPACES)
|
155
|
+
@items = []
|
156
|
+
nodeset.each do |item|
|
157
|
+
item_obj = SemanticCrawler::Gdacs::Resource.new(item)
|
158
|
+
@items << item_obj
|
159
|
+
end
|
160
|
+
@items
|
161
|
+
end
|
162
|
+
|
163
|
+
# Query the root_node
|
164
|
+
def query_root_node(xpath_query, namespaces = {})
|
165
|
+
if !@root_node.nil?
|
166
|
+
@root_node.xpath(xpath_query, namespaces)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|