semantic-crawler 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +17 -6
- data/Rakefile +2 -4
- data/lib/semantic_crawler.rb +25 -2
- data/lib/semantic_crawler/factbook.rb +11 -0
- data/lib/semantic_crawler/factbook/country.rb +15 -5
- data/lib/semantic_crawler/freebase.rb +4 -0
- data/lib/semantic_crawler/freebase/country.rb +5 -0
- data/lib/semantic_crawler/freebase/crawler.rb +1 -0
- data/lib/semantic_crawler/freebase/entity.rb +1 -0
- data/lib/semantic_crawler/gdacs.rb +6 -0
- data/lib/semantic_crawler/gdacs/feed.rb +100 -0
- data/lib/semantic_crawler/gdacs/feed_item.rb +172 -0
- data/lib/semantic_crawler/gdacs/resource.rb +66 -0
- data/lib/semantic_crawler/linked_geo_data.rb +5 -0
- data/lib/semantic_crawler/linked_geo_data/relevant_node.rb +13 -0
- data/lib/semantic_crawler/version.rb +3 -1
- data/test/dummy/log/test.log +4802 -0
- data/test/factbook_test.rb +1 -1
- data/test/gdacs_test.rb +80 -0
- metadata +37 -27
data/README.rdoc
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
[ATTENTION] This library is under heavy development
|
2
|
-
for a usable version.
|
1
|
+
[ATTENTION] This library is under heavy development. Please be patient for a usable version.
|
3
2
|
|
4
3
|
= SemanticCrawler
|
5
4
|
|
@@ -12,27 +11,35 @@ to bypass complex NLP (natural language processing).
|
|
12
11
|
== Supported Sources
|
13
12
|
|
14
13
|
* CIA Factbook RDF Dump, see http://www4.wiwiss.fu-berlin.de/factbook/directory/countries
|
14
|
+
* [Started] LinkedGeoData - LGD (see http://linkedgeodata.org)
|
15
|
+
* [Started] GDACS (see http://gdacs.org)
|
16
|
+
* [Started] Freebase (see http://freebase.com)
|
15
17
|
|
16
18
|
=== TODO
|
17
19
|
|
18
|
-
* LinkedGeoData - LGD(see http://linkedgeodata.org)
|
19
|
-
* Freebase
|
20
20
|
* Geonames
|
21
21
|
* DBPedia
|
22
22
|
* Different Government Sources
|
23
23
|
|
24
|
+
|
24
25
|
== Installation
|
25
26
|
|
27
|
+
$ gem install semantic-crawler
|
28
|
+
|
29
|
+
Or from source:
|
30
|
+
|
26
31
|
$ git clone git://github.com/obale/semantic_crawler.git
|
27
32
|
$ cd semantic_crawler
|
28
33
|
$ bundle install
|
29
34
|
$ rake build
|
30
35
|
$ rake install pkg/semantic-crawler-*.gem
|
31
36
|
|
32
|
-
If the library is stable enough it will be published at rubygems.org. That simplifies the installation.
|
33
|
-
|
34
37
|
You can also add this library as a dependency in your Gemfile:
|
35
38
|
|
39
|
+
gem "semantic-crawler"
|
40
|
+
|
41
|
+
Or from source:
|
42
|
+
|
36
43
|
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git" # for the master branch or
|
37
44
|
gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git", :tags => "xyz" # for the xyz tag
|
38
45
|
|
@@ -56,6 +63,10 @@ see CHANGELOG.rdoc
|
|
56
63
|
* Ruby 1.8.7-p358 and Rails 3.2.2
|
57
64
|
* Ruby 1.9.3-p125 and Rails 3.2.2
|
58
65
|
|
66
|
+
=== Development Branch Health
|
67
|
+
|
68
|
+
{<img src="https://secure.travis-ci.org/obale/semantic_crawler.png?branch=master"/>}[http://travis-ci.org/#!/obale/semantic_crawler]
|
69
|
+
|
59
70
|
|
60
71
|
== License
|
61
72
|
|
data/Rakefile
CHANGED
@@ -16,13 +16,12 @@ RDoc::Task.new(:rdoc) do |rdoc|
|
|
16
16
|
rdoc.rdoc_dir = 'rdoc'
|
17
17
|
rdoc.title = 'SemanticCrawler'
|
18
18
|
rdoc.options << '--line-numbers'
|
19
|
+
rdoc.options << '--main=README.rdoc'
|
19
20
|
rdoc.rdoc_files.include('README.rdoc')
|
21
|
+
rdoc.rdoc_files.include('CHANGELOG.rdoc')
|
20
22
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
21
23
|
end
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
25
|
Bundler::GemHelper.install_tasks
|
27
26
|
|
28
27
|
require 'rake/testtask'
|
@@ -34,5 +33,4 @@ Rake::TestTask.new(:test) do |t|
|
|
34
33
|
t.verbose = false
|
35
34
|
end
|
36
35
|
|
37
|
-
|
38
36
|
task :default => :test
|
data/lib/semantic_crawler.rb
CHANGED
@@ -1,13 +1,36 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require "httparty"
|
2
4
|
|
5
|
+
# The top level module contains the different data sources
|
6
|
+
# as sub-modules. Currently there are the following modules
|
7
|
+
# available:
|
8
|
+
#
|
9
|
+
# * Freebase
|
10
|
+
# * Factbook
|
11
|
+
# * LinkedGeoData
|
12
|
+
# * Gdacs
|
13
|
+
#
|
14
|
+
# The existing modules are extended stepwise and additional
|
15
|
+
# sources are added in the future.
|
3
16
|
module SemanticCrawler
|
4
17
|
end
|
5
18
|
|
6
|
-
# Freebase.com
|
19
|
+
# Freebase.com - module: Freebase
|
7
20
|
require "semantic_crawler/freebase/crawler"
|
8
21
|
require "semantic_crawler/freebase/entity"
|
9
22
|
require "semantic_crawler/freebase/country"
|
10
23
|
|
11
|
-
# CIA Factbook RDF Dump
|
24
|
+
# CIA Factbook RDF Dump - module: Factbook
|
25
|
+
require "semantic_crawler/factbook"
|
12
26
|
require "semantic_crawler/factbook/country"
|
13
27
|
|
28
|
+
# LinkedGeoData (http://linkedgeodata.org) - module: LinkedGeoData
|
29
|
+
require "semantic_crawler/linked_geo_data"
|
30
|
+
require "semantic_crawler/linked_geo_data/relevant_node"
|
31
|
+
|
32
|
+
# GDACS (http://gdacs.org) - module: Gdacs
|
33
|
+
require "semantic_crawler/gdacs"
|
34
|
+
require "semantic_crawler/gdacs/feed.rb"
|
35
|
+
require "semantic_crawler/gdacs/feed_item.rb"
|
36
|
+
require "semantic_crawler/gdacs/resource.rb"
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# The RDF Dump of the CIA Factbook contains country relevant information. The
|
2
|
+
# information may be outdated, but for general-purpose information that
|
3
|
+
# never or only infrequently changes, this source is perfect.
|
4
|
+
#
|
5
|
+
# This module encapsulates the access to the underlying RDF files and wraps
|
6
|
+
# the most important properties. Not wrapped properties in the namespace of
|
7
|
+
# factbook could be accessed via the following method:
|
8
|
+
#
|
9
|
+
# * link:SemanticCrawler::Factbook::Country::get_factbook_property
|
10
|
+
module SemanticCrawler::Factbook
|
11
|
+
end
|
@@ -1,18 +1,28 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'open-uri'
|
3
5
|
|
4
6
|
module SemanticCrawler
|
5
7
|
module Factbook
|
8
|
+
# Extracted from the RDF Dump of the CIA Factbook. Contains all
|
9
|
+
# relevant, but maybe deprecated information about countries.
|
6
10
|
class Country
|
11
|
+
# The prefix used for each country
|
7
12
|
@@URI_PREFIX = "http://www4.wiwiss.fu-berlin.de/factbook/data/"
|
8
13
|
|
14
|
+
# Predefined RDFS/OWL namespaces used for RDF file parsing.
|
9
15
|
@@NAMESPACES = {
|
10
16
|
"factbook" => "http://www4.wiwiss.fu-berlin.de/factbook/ns#",
|
11
17
|
"rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
|
12
18
|
"rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
13
19
|
}
|
14
20
|
|
21
|
+
# Country name given as input during the object creation.
|
15
22
|
attr_reader :country_name
|
23
|
+
|
24
|
+
# The complete URL of the country. Could also be wrong,
|
25
|
+
# if the country_name is not valid.
|
16
26
|
attr_reader :url
|
17
27
|
|
18
28
|
# Get Country Information from the CIA Factbook. see
|
@@ -23,15 +33,14 @@ module SemanticCrawler
|
|
23
33
|
# >> puts austria.background
|
24
34
|
#
|
25
35
|
# Arguments:
|
26
|
-
#
|
36
|
+
# new_country_name: (String)
|
27
37
|
def initialize(new_country_name)
|
28
|
-
|
29
|
-
@
|
30
|
-
@url = @@URI_PREFIX + @country_name
|
38
|
+
@country_name = new_country_name
|
39
|
+
@url = @@URI_PREFIX + @country_name.downcase.gsub(" ", "_").gsub("usa", "united_states")
|
31
40
|
begin
|
32
41
|
fetch_rdf
|
33
42
|
rescue => e
|
34
|
-
puts "Not able to get country information, through exception: " + e
|
43
|
+
puts "Not able to get country information, through exception: " + e.message
|
35
44
|
end
|
36
45
|
end
|
37
46
|
|
@@ -116,6 +125,7 @@ module SemanticCrawler
|
|
116
125
|
end
|
117
126
|
|
118
127
|
private
|
128
|
+
# Retrieves the RDF stream
|
119
129
|
def fetch_rdf
|
120
130
|
@doc = Nokogiri::XML(open(@url))
|
121
131
|
end
|
@@ -1,9 +1,14 @@
|
|
1
|
+
# [XXX] The current implementation outputs only an unreadable JSON object.
|
1
2
|
class SemanticCrawler::Freebase::Country < SemanticCrawler::Freebase::Entity
|
3
|
+
# The Freebase object that should be retrieved
|
2
4
|
attr_accessor :input_name
|
5
|
+
# The URL that points to the JSON object.
|
3
6
|
attr_accessor :json_link
|
4
7
|
|
8
|
+
# The URL prefix of a Freebase experimental JSON entity.
|
5
9
|
@@uri_prefix = "http://www.freebase.com/experimental/topic/standard/en/"
|
6
10
|
|
11
|
+
# Creates a new Freebase object (JSON)
|
7
12
|
def initialize
|
8
13
|
@input_name = nil
|
9
14
|
@html_link = nil
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module SemanticCrawler
|
4
|
+
module Gdacs
|
5
|
+
# The GDACS.org RSS feed contains the latest crisis information about
|
6
|
+
# the following crisis types:
|
7
|
+
#
|
8
|
+
# * Floods
|
9
|
+
# * Earthquakes
|
10
|
+
# * Tropical Cyclones
|
11
|
+
# * Volcanoes
|
12
|
+
class Feed
|
13
|
+
|
14
|
+
@@NAMESPACES = {
|
15
|
+
"atom" => "http://www.w3.org/2005/Atom"
|
16
|
+
}
|
17
|
+
|
18
|
+
# The gdacs.org RSS feed URL. (default:
|
19
|
+
# http://new.gdacs.org/xml/rss.xml)
|
20
|
+
attr_reader :url
|
21
|
+
|
22
|
+
# Initializes the gdacs.org feed URL. If not specified the default
|
23
|
+
# URL (http://new.gdacs.org/xml/rss.xml) is used. Normally the
|
24
|
+
# feed URL should not be changed.
|
25
|
+
def initialize(new_url = "http://new.gdacs.org/xml/rss.xml")
|
26
|
+
@url = new_url
|
27
|
+
@root_node = nil
|
28
|
+
begin
|
29
|
+
fetch_feed
|
30
|
+
rescue => e
|
31
|
+
puts "Not able to get country information, through exception: " + e.message
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Get rss/channel/title
|
36
|
+
def title
|
37
|
+
query_root_node("title/text()")
|
38
|
+
end
|
39
|
+
|
40
|
+
# Get rss/channel/link
|
41
|
+
def link
|
42
|
+
query_root_node("link/text()")
|
43
|
+
end
|
44
|
+
|
45
|
+
# Get rss/channel/description
|
46
|
+
def description
|
47
|
+
query_root_node("description/text()")
|
48
|
+
end
|
49
|
+
|
50
|
+
# Get rss/channel/pubDate
|
51
|
+
def pubDate
|
52
|
+
query_root_node("pubDate/text()")
|
53
|
+
end
|
54
|
+
|
55
|
+
# Get rss/channel/webMaster
|
56
|
+
def webMaster
|
57
|
+
query_root_node("webMaster/text()")
|
58
|
+
end
|
59
|
+
|
60
|
+
# Get rss/channel/managingEditor
|
61
|
+
def managingEditor
|
62
|
+
query_root_node("managingEditor/text()")
|
63
|
+
end
|
64
|
+
|
65
|
+
# Get rss/channel/atom:link
|
66
|
+
def atom_link
|
67
|
+
query_root_node("atom:link/@href", @@NAMESPACES)
|
68
|
+
end
|
69
|
+
|
70
|
+
# Get rss/channel/item*
|
71
|
+
def items
|
72
|
+
nodeset = query_root_node("item")
|
73
|
+
@items = []
|
74
|
+
nodeset.each do |item|
|
75
|
+
item_obj = SemanticCrawler::Gdacs::FeedItem.new(item)
|
76
|
+
@items << item_obj
|
77
|
+
end
|
78
|
+
@items
|
79
|
+
end
|
80
|
+
|
81
|
+
# Query the root_node
|
82
|
+
def query_root_node(xpath_query, namespaces = {})
|
83
|
+
if !@root_node.nil?
|
84
|
+
@root_node.xpath(xpath_query, namespaces)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def xml_document
|
89
|
+
@root_node.to_s
|
90
|
+
end
|
91
|
+
|
92
|
+
private
|
93
|
+
# Retrieves the RSS feed
|
94
|
+
def fetch_feed
|
95
|
+
@doc = Nokogiri::XML(open(@url))
|
96
|
+
@root_node = @doc.xpath("/rss/channel")
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
@@ -0,0 +1,172 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module SemanticCrawler
|
4
|
+
module Gdacs
|
5
|
+
# One crisis entity with related resources. Could be one of the
|
6
|
+
# following crisis types:
|
7
|
+
#
|
8
|
+
# * Floods
|
9
|
+
# * Earthquakes
|
10
|
+
# * Tropical Cyclones
|
11
|
+
# * Volcanoes
|
12
|
+
class FeedItem
|
13
|
+
# XML namespaces used for the parsing process
|
14
|
+
@@NAMESPACES = {
|
15
|
+
"gdacs" => "http://www.gdacs.org",
|
16
|
+
"asgard" => "http://asgard.jrc.it",
|
17
|
+
"geo" => "http://www.w3.org/2003/01/geo/wgs84_pos#",
|
18
|
+
"dc" => "http://purl.org/dc/elements/1.1/"
|
19
|
+
}
|
20
|
+
|
21
|
+
def initialize(new_root_node)
|
22
|
+
@root_node = new_root_node
|
23
|
+
end
|
24
|
+
|
25
|
+
# Returns the crisis title
|
26
|
+
def title
|
27
|
+
query_root_node("title/text()")
|
28
|
+
end
|
29
|
+
|
30
|
+
# Returns the crisis description
|
31
|
+
def description
|
32
|
+
query_root_node("description/text()")
|
33
|
+
end
|
34
|
+
|
35
|
+
# Returns the enclosure URL
|
36
|
+
def enclosure_url
|
37
|
+
query_root_node("enclosure/@url")
|
38
|
+
end
|
39
|
+
|
40
|
+
# Returns the enclosure type, e.g. image/png
|
41
|
+
def enclosure_type
|
42
|
+
query_root_node("enclosure/@type")
|
43
|
+
end
|
44
|
+
|
45
|
+
# Returns the enclosure length, e.g. 1
|
46
|
+
def enclosure_length
|
47
|
+
query_root_node("enclosure/@length")
|
48
|
+
end
|
49
|
+
|
50
|
+
# Returns the crisis gdacs link
|
51
|
+
def link
|
52
|
+
query_root_node("link/text()")
|
53
|
+
end
|
54
|
+
|
55
|
+
# Returns the publication date
|
56
|
+
def pubDate
|
57
|
+
query_root_node("pubDate/text()")
|
58
|
+
end
|
59
|
+
|
60
|
+
# Returns the crisis start date
|
61
|
+
def fromdate
|
62
|
+
query_root_node("gdacs:fromdate/text()", @@NAMESPACES)
|
63
|
+
end
|
64
|
+
|
65
|
+
# Returns the crisis end date
|
66
|
+
def todate
|
67
|
+
query_root_node("gdacs:todate/text()", @@NAMESPACES)
|
68
|
+
end
|
69
|
+
|
70
|
+
# Returns the crisis subject abbreviation
|
71
|
+
def subject
|
72
|
+
query_root_node("dc:subject/text()", @@NAMESPACES)
|
73
|
+
end
|
74
|
+
|
75
|
+
# Returns a unique crisis identifier (could be non permanent)
|
76
|
+
def guid
|
77
|
+
query_root_node("guid/text()")
|
78
|
+
end
|
79
|
+
|
80
|
+
# Returns the latitude GPS coordinate where the crisis has occurred
|
81
|
+
def latitude
|
82
|
+
query_root_node("geo:Point/geo:lat/text()", @@NAMESPACES)
|
83
|
+
end
|
84
|
+
|
85
|
+
# Returns the longitude GPS coordinate where the crisis has occurred
|
86
|
+
def longitude
|
87
|
+
query_root_node("geo:Point/geo:long/text()", @@NAMESPACES)
|
88
|
+
end
|
89
|
+
|
90
|
+
# Returns the version
|
91
|
+
def version
|
92
|
+
query_root_node("gdacs:version/text()", @@NAMESPACES)
|
93
|
+
end
|
94
|
+
|
95
|
+
# Returns the event type abbreviation, e.g. VO (for volcanic
|
96
|
+
# eruption), EQ (for earthquake), FL (for flood), TC (for tropical
|
97
|
+
# cyclone)
|
98
|
+
def eventtype
|
99
|
+
query_root_node("gdacs:eventtype/text()", @@NAMESPACES)
|
100
|
+
end
|
101
|
+
|
102
|
+
# Returns the alert level, could be GREEN, ORANGE or RED
|
103
|
+
def alertlevel
|
104
|
+
query_root_node("gdacs:alertlevel/text()", @@NAMESPACES)
|
105
|
+
end
|
106
|
+
|
107
|
+
# Returns the event name if available
|
108
|
+
def eventname
|
109
|
+
query_root_node("gdacs:eventname/text()", @@NAMESPACES)
|
110
|
+
end
|
111
|
+
|
112
|
+
# Returns the event id
|
113
|
+
def eventid
|
114
|
+
query_root_node("gdacs:eventid/text()", @@NAMESPACES)
|
115
|
+
end
|
116
|
+
|
117
|
+
# Returns the episode id
|
118
|
+
def episodeid
|
119
|
+
query_root_node("gdacs:episodeid/text()", @@NAMESPACES)
|
120
|
+
end
|
121
|
+
|
122
|
+
# Returns the severity as human readable string
|
123
|
+
def severity
|
124
|
+
query_root_node("gdacs:severity/text()", @@NAMESPACES)
|
125
|
+
end
|
126
|
+
|
127
|
+
# Returns the population as human readable string
|
128
|
+
def population
|
129
|
+
query_root_node("gdacs:population/text()", @@NAMESPACES)
|
130
|
+
end
|
131
|
+
|
132
|
+
# Returns the vulnerability as human readable string
|
133
|
+
def vulnerability
|
134
|
+
query_root_node("gdacs:vulnerability/text()", @@NAMESPACES)
|
135
|
+
end
|
136
|
+
|
137
|
+
# Returns the country iso3 code if available
|
138
|
+
def iso3
|
139
|
+
query_root_node("gdacs:iso3/text()", @@NAMESPACES)
|
140
|
+
end
|
141
|
+
|
142
|
+
# Returns the country name(s)
|
143
|
+
def country
|
144
|
+
query_root_node("gdacs:country/text()", @@NAMESPACES)
|
145
|
+
end
|
146
|
+
|
147
|
+
# Returns the content of the gdacs:glide element (presumably the GLIDE disaster identifier - TODO confirm)
|
148
|
+
def glide
|
149
|
+
query_root_node("gdacs:glide/text()", @@NAMESPACES)
|
150
|
+
end
|
151
|
+
|
152
|
+
# Returns an array of SemanticCrawler::Gdacs::Resource objects
|
153
|
+
def resources
|
154
|
+
nodeset = query_root_node("gdacs:resources/gdacs:resource", @@NAMESPACES)
|
155
|
+
@items = []
|
156
|
+
nodeset.each do |item|
|
157
|
+
item_obj = SemanticCrawler::Gdacs::Resource.new(item)
|
158
|
+
@items << item_obj
|
159
|
+
end
|
160
|
+
@items
|
161
|
+
end
|
162
|
+
|
163
|
+
# Query the root_node
|
164
|
+
def query_root_node(xpath_query, namespaces = {})
|
165
|
+
if !@root_node.nil?
|
166
|
+
@root_node.xpath(xpath_query, namespaces)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|