semantic-crawler 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -1,4 +1,4 @@
1
- = SemanticCrawler
1
+ = Ruby Semantic Crawler Library
2
2
 
3
3
  This project encapsulates data gathering from different sources.
4
4
  It simplifies the extension of internal data with publicly available
@@ -13,7 +13,7 @@ to bypass complex NLP (natural language processing).
13
13
  * {FAO - Food and Agriculture Organization of the United Nations}[http://www.fao.org]
14
14
  * {LinkedGeoData - LGD}[http://linkedgeodata.org]
15
15
  * {GDACS}[http://gdacs.org]
16
- * [Started] {Freebase}[http://freebase.com]
16
+ * {Freebase}[http://freebase.com]
17
17
 
18
18
  === TODO
19
19
 
@@ -56,13 +56,13 @@ library include or execute the following line:
56
56
  The GeoNames module is able to return a Factbook::Country and Fao::Country
57
57
  module on the base of input GPS coordinates (lat/long).
58
58
 
59
- >> @innsbruck = SemanticCrawler::GeoNames::Country.new(47.271338, 11.395333)
60
- >> articles = @innsbruck.get_wikipedia_articles
59
+ >> innsbruck = SemanticCrawler::GeoNames::Country.new(47.271338, 11.395333)
60
+ >> articles = innsbruck.get_wikipedia_articles
61
61
  >> articles.each do |article|
62
62
  >> puts article.wikipedia_url
63
63
  >> end
64
- >> factbook_obj = @innsbruck.get_factbook_country
65
- >> fao_obj = @innsbruck.get_fao_country
64
+ >> factbook_obj = innsbruck.get_factbook_country
65
+ >> fao_obj = innsbruck.get_fao_country
66
66
 
67
67
  === Factbook
68
68
 
@@ -120,12 +120,34 @@ Geo information from {LinkedGeoData}[http://linkedgeodata.org]:
120
120
  >> end
121
121
 
122
122
 
123
+ == Freebase
124
+
125
+ Freebase.com country information:
126
+
127
+ >> country = SemanticCrawler::Freebase::Country.new("Austria")
128
+ >> links = country.same_as
129
+ >> links.each do |link|
130
+ >> puts link if link.start_with?("http")
131
+ >> end
132
+ >> puts country.website
133
+
134
+
123
135
  == Tested with
124
136
 
125
137
  * Ruby 1.8.7-p358 and Rails 3.2.2
126
138
  * Ruby 1.9.3-p125 and Rails 3.2.2
127
139
 
128
140
 
141
+ == Additional Links
142
+
143
+ * {RubyGems}[http://rubygems.org/gems/semantic-crawler]
144
+ * {RubyDoc}[http://rubydoc.info/gems/semantic-crawler/file/README.rdoc]
145
+ * {GitHub}[https://github.com/obale/semantic_crawler]
146
+ * {Travis CI}[http://travis-ci.org/#!/obale/semantic_crawler]
147
+ * {Bugtracker}[https://github.com/obale/semantic_crawler/issues?sort=created&direction=desc&state=open]
148
+ * {Wiki}[https://github.com/obale/semantic_crawler/wiki]
149
+
150
+
129
151
  == License
130
152
 
131
153
  (c) 2012 by Alex Oberhauser for {Sigimera}[http://www.sigimera.org],
@@ -25,7 +25,7 @@ module SemanticCrawler
25
25
  def initialize(new_country_name)
26
26
  @country_name = new_country_name
27
27
  if !@country_name.nil?
28
- @url = @@URI_PREFIX + @country_name.gsub(" ", "_").gsub("USA", "United_States")
28
+ @url = @@URI_PREFIX + @country_name.gsub(" ", "_").gsub("USA", "United_States_of_America")
29
29
  @root_node = nil
30
30
  begin
31
31
  fetch_rdf
@@ -1,49 +1,127 @@
1
1
  # encoding: UTF-8
2
2
 
3
- require 'json'
4
-
5
3
  module SemanticCrawler
6
4
  module Freebase
7
5
  # Freebase Country entity. Currently it is very abstract and
8
6
  # could be any entry on Freebase.
9
- #
10
- # [XXX] The current implementation outputs only an unreadable JSON object.
11
7
  class Country
12
8
  include HTTParty
13
- # The Freebase object that should be retrieved
14
- attr_accessor :input_name
15
- # The URL that points to the JSON object.
16
- attr_accessor :json_link
9
+ # The URL prefix of an English Freebase RDF entity.
10
+ @@URI_PREFIX = "http://rdf.freebase.com/rdf/en."
11
+
12
+ # Namespace hash
13
+ @@NAMESPACES = {
14
+ "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
15
+ "fb" => "http://rdf.freebase.com/ns/",
16
+ "owl" => "http://www.w3.org/2002/07/owl#"
17
+ }
17
18
 
18
- # The URL prefix of an Freebase experimental JSON entity.
19
- @@uri_prefix = "http://www.freebase.com/experimental/topic/standard/en/"
19
+ # @attribute [r]
20
+ # The read only country name
21
+ attr_reader :country_name
20
22
 
21
23
  # Creates a new Freebase country object and tries to fetch its RDF description
22
24
  def initialize(new_country_name)
23
- @input_name = new_country_name
24
- @html_link = nil
25
- @json_link = nil
26
- end
27
-
28
- # Get Country Information from freebase.com
29
- #
30
- # Example:
31
- # >> austria = SemanticCrawler::Freebase::Country.new("Austria")
32
- # >> austria.get
33
- #
34
- # Argumenst:
35
- # options: (Hash)
36
- #
37
- #
38
- def get(options={})
39
- if !@input_name.nil?
40
- @json_link = @@uri_prefix + @input_name
41
- JSON(self.class.get(@json_link, options).body)["result"]
42
- else
43
- nil
25
+ @country_name = new_country_name
26
+ if !@country_name.nil?
27
+ @normalized_country = @country_name.gsub(" ", "_").gsub("USA", "United_States_of_America").downcase
28
+ @url = @@URI_PREFIX + @normalized_country
29
+ @subject = "http://rdf.freebase.com/ns/en.#{@normalized_country}"
30
+ @root_node = nil
31
+ begin
32
+ fetch_rdf
33
+ rescue => e
34
+ $log.error("Not able to get country information, through exception: #{e}")
35
+ end
44
36
  end
45
37
  end
46
38
 
39
+ # Extract the fb:common.topic.topical_webpage property
40
+ # @return [String] Website as String
41
+ def website
42
+ query_root_node("fb:common.topic.topical_webpage/text()", @@NAMESPACES).to_s
43
+ end
44
+
45
+ # Extract the fb:type.object.name property
46
+ # @param [String] lang The language as an ISO 639-1 two-letter code (e.g. "en")
47
+ # @return [String] The country name
48
+ def object_name(lang = "en")
49
+ query_root_node("fb:type.object.name[@xml:lang='#{lang}']/text()", @@NAMESPACES).to_s
50
+ end
51
+
52
+ # Extract the fb:location.location.contains properties
53
+ # @return [Array<String>]
54
+ def contains
55
+ return_list = []
56
+ list = query_root_node("fb:location.location.contains/@rdf:resource", @@NAMESPACES)
57
+ if !list.nil?
58
+ list.each do |entry|
59
+ return_list << entry.to_s
60
+ end
61
+ end
62
+ return_list
63
+ end
64
+
65
+ # Extract fb:location.country.administrative_divisions properties
66
+ # @return [Array<String>]
67
+ def administrative_divisions
68
+ return_list = []
69
+ list = query_root_node("fb:location.country.administrative_divisions/@rdf:resource", @@NAMESPACES)
70
+ if !list.nil?
71
+ list.each do |entry|
72
+ return_list << entry.to_s
73
+ end
74
+ end
75
+ return_list
76
+ end
77
+
78
+ # Extract fb:government.governmental_jurisdiction.agencies
79
+ # @return [Array<String>] Returns the governmental jurisdiction agencies as URLs
80
+ def agencies
81
+ return_list = []
82
+ list = query_root_node("fb:government.governmental_jurisdiction.agencies/@rdf:resource", @@NAMESPACES)
83
+ if !list.nil?
84
+ list.each do |entry|
85
+ return_list << entry.to_s
86
+ end
87
+ end
88
+ return_list
89
+ end
90
+
91
+ # Extract the owl:sameAs links
92
+ # @return [Array<String>] A list of same concept URLs, e.g. to dbpedia or nytimes
93
+ def same_as
94
+ return_list = []
95
+ list = query_root_node("owl:sameAs/@rdf:resource", @@NAMESPACES)
96
+ if !list.nil?
97
+ list.each do |entry|
98
+ return_list << entry.to_s
99
+ end
100
+ end
101
+ return_list
102
+ end
103
+
104
+ # Executes an XPath query on the root node, optionally with a hash of namespaces
105
+ # @return [Nokogiri::XML::NodeSet, nil] The matching nodes, or nil if no root node is available
106
+ def query_root_node(xpath_query, namespaces = {})
107
+ if !@root_node.nil?
108
+ @root_node.xpath(xpath_query, namespaces)
109
+ end
110
+ end
111
+
112
+ # Outputs the document as XML
113
+ # @return [String] The document serialized as XML
114
+ def xml_document
115
+ @root_node.to_s
116
+ end
117
+
118
+ private
119
+ # Retrieves the RDF file
120
+ def fetch_rdf
121
+ @doc = Nokogiri::XML(open(@url))
122
+ @root_node = @doc.xpath("//*[@rdf:about='#{@subject}']", @@NAMESPACES)
123
+ end
124
+
47
125
  end
48
126
  end
49
127
  end
@@ -1,5 +1,4 @@
1
1
  module SemanticCrawler
2
-
3
2
  # The current version of this library.
4
- VERSION = "0.3.0"
3
+ VERSION = "0.4.0"
5
4
  end
@@ -17,7 +17,7 @@ require "geonames"
17
17
  require 'logger'
18
18
 
19
19
  # Top module that contains the whole library. Each sub-module
20
- # is wrappes one source.
20
+ # wraps one source.
21
21
  module SemanticCrawler
22
22
  $log = Logger.new(File.expand_path('../../log/semantic-crawler.log', __FILE__), 'daily')
23
23
  end
@@ -1,4 +1,26 @@
1
- #desc "Explaining what the task does"
2
- #task :semantic_crawler do
3
- # # Task goes here
4
- #end
1
+ require "grit"
2
+
3
+ desc "Outputs the ChangeLog of the current release (latest release on rubygems.org)"
4
+ task :changelog do
5
+ repo = Grit::Repo.new('.')
6
+
7
+ currentMerge = repo.log('origin/master', nil, :merges => true)[0].id
8
+ lastMerge = repo.log('origin/master', nil, :merges => true)[1].id
9
+ changes = repo.commits_between(lastMerge, currentMerge)
10
+
11
+ tags = repo.tags
12
+
13
+ if !changes.nil?
14
+ changes.reverse!
15
+
16
+ tags.each do |tag|
17
+ if tag.commit.sha.eql?(changes[0].id)
18
+ puts "=> Current version: \033[0;31m#{tag.name}\033[0m (tagged)"
19
+ end
20
+ end
21
+
22
+ changes.each do |commit|
23
+ puts "\033[0;33m#{commit.id[0,7]}\033[0m - #{commit.date} - \033[0;32m#{commit.message}\033[0m"
24
+ end
25
+ end
26
+ end
@@ -9,11 +9,11 @@ Gem::Specification.new do |s|
9
9
  s.version = SemanticCrawler::VERSION
10
10
  s.authors = ["Alex Oberhauser"]
11
11
  s.email = ["alex.oberhauser@sigimera.org"]
12
+ s.licenses = ["MIT"]
12
13
  s.homepage = "https://github.com/obale/semantic_crawler"
13
14
  s.summary = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources."
14
15
  s.description = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources. Currently country information from Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis information from GDACS.org and geo data from LinkedGeoData are supported. Additional the GeoNames module allows to get Factbook and FAO country information from GPS coordinates."
15
16
 
16
- #s.files = Dir["{app,config,db,lib,log}/**/*"] + ["MIT-LICENSE", "Rakefile", "README.rdoc"]
17
17
  s.files = `git ls-files`.split("\n")
18
18
  s.executables = `git ls-files -- bin/*`.split('\n').map{ |f| File.basename(f) }
19
19
  s.test_files = Dir["{test,spec}/**/*"]
@@ -26,6 +26,7 @@ Gem::Specification.new do |s|
26
26
  s.add_dependency "geonames" # Use for the GeoNames module
27
27
 
28
28
  s.add_development_dependency "yard"
29
+ s.add_development_dependency "grit"
29
30
  s.add_development_dependency "rails", "~> 3.2.2"
30
31
  s.add_development_dependency "sqlite3"
31
32
  s.add_development_dependency "rspec-rails"
@@ -6,10 +6,10 @@ describe SemanticCrawler::Fao do
6
6
  @obj = SemanticCrawler::Fao::Country.new("Austria")
7
7
  end
8
8
 
9
- it "init empty country object" do
10
- @obj = SemanticCrawler::Fao::Country.new(nil)
11
- @obj.country_name.should.eql?(nil)
12
- @obj.url.should eq(nil)
9
+ it "test nil country" do
10
+ wrongCountry = SemanticCrawler::Fao::Country.new(nil)
11
+ wrongCountry.country_name.should.eql?(nil)
12
+ wrongCountry.url.should eq(nil)
13
13
  end
14
14
 
15
15
  it "check austria object" do
@@ -2,16 +2,61 @@ require 'spec_helper'
2
2
 
3
3
  describe SemanticCrawler::Freebase do
4
4
 
5
+ before(:all) do
6
+ @country = SemanticCrawler::Freebase::Country.new("Austria")
7
+ end
8
+
5
9
  it "test empty country" do
6
10
  emptyCountry = SemanticCrawler::Freebase::Country.new(nil)
7
- emptyCountry.get.nil?.should == true
11
+ emptyCountry.country_name.should_not be
12
+ end
13
+
14
+ it "check austria country name and xml_document" do
15
+ @country.country_name.should eq("Austria")
16
+ @country.xml_document.size.should > 0
8
17
  end
9
18
 
10
- it "init austria country information" do
11
- austria = SemanticCrawler::Freebase::Country.new("Austria")
12
- austria.get.to_s.nil?.should == false
19
+ it "check same as links" do
20
+ links = @country.same_as
21
+ links.size.should > 0
22
+ links.each do |link|
23
+ be_valid link.start_with?("http")
24
+ end
13
25
  end
14
26
 
15
- pending "Implement the 'Freebase' module"
27
+ it "check website" do
28
+ @country.website.should eq("http://www.austria.gv.at/")
29
+ end
30
+
31
+ it "check object name" do
32
+ @country.object_name.should eq("Austria")
33
+ @country.object_name("en").should eq("Austria")
34
+ @country.object_name("sl").should eq("Avstrija")
35
+ @country.object_name("cs").should eq("Rakousko")
36
+ end
37
+
38
+ it "check same as links" do
39
+ links = @country.contains
40
+ links.size.should > 0
41
+ links.each do |link|
42
+ be_valid link.start_with?("http")
43
+ end
44
+ end
45
+
46
+ it "check administrative divisions" do
47
+ links = @country.administrative_divisions
48
+ links.size.should > 0
49
+ links.each do |link|
50
+ be_valid link.start_with?("http")
51
+ end
52
+ end
53
+
54
+ it "check agencies" do
55
+ links = @country.agencies
56
+ links.size.should > 0
57
+ links.each do |link|
58
+ be_valid link.start_with?("http")
59
+ end
60
+ end
16
61
 
17
62
  end
@@ -53,3 +53,34 @@ ActionController::RoutingError (No route matches [GET] "/"):
53
53
   (91.0ms) CREATE UNIQUE INDEX "unique_schema_migrations" ON "schema_migrations" ("version")
54
54
   (0.1ms) SELECT version FROM "schema_migrations"
55
55
   (85.5ms) INSERT INTO "schema_migrations" (version) VALUES ('0')
56
+  (0.1ms) select sqlite_version(*)
57
+  (113.4ms) CREATE TABLE "schema_migrations" ("version" varchar(255) NOT NULL)
58
+  (0.0ms) PRAGMA index_list("schema_migrations")
59
+  (101.1ms) CREATE UNIQUE INDEX "unique_schema_migrations" ON "schema_migrations" ("version")
60
+  (0.1ms) SELECT "schema_migrations"."version" FROM "schema_migrations" 
61
+
62
+
63
+ Started GET "/" for 127.0.0.1 at 2012-04-16 11:15:17 +0200
64
+
65
+ ActionController::RoutingError (No route matches [GET] "/"):
66
+ actionpack (3.2.2) lib/action_dispatch/middleware/debug_exceptions.rb:21:in `call'
67
+ actionpack (3.2.2) lib/action_dispatch/middleware/show_exceptions.rb:56:in `call'
68
+ railties (3.2.2) lib/rails/rack/logger.rb:26:in `call_app'
69
+ railties (3.2.2) lib/rails/rack/logger.rb:16:in `call'
70
+ actionpack (3.2.2) lib/action_dispatch/middleware/request_id.rb:22:in `call'
71
+ rack (1.4.1) lib/rack/methodoverride.rb:21:in `call'
72
+ rack (1.4.1) lib/rack/runtime.rb:17:in `call'
73
+ activesupport (3.2.2) lib/active_support/cache/strategy/local_cache.rb:72:in `call'
74
+ rack (1.4.1) lib/rack/lock.rb:15:in `call'
75
+ actionpack (3.2.2) lib/action_dispatch/middleware/static.rb:61:in `call'
76
+ railties (3.2.2) lib/rails/engine.rb:479:in `call'
77
+ railties (3.2.2) lib/rails/application.rb:220:in `call'
78
+ rack (1.4.1) lib/rack/content_length.rb:14:in `call'
79
+ railties (3.2.2) lib/rails/rack/log_tailer.rb:14:in `call'
80
+ rack (1.4.1) lib/rack/handler/webrick.rb:59:in `service'
81
+ /home/obale/.rvm/rubies/ruby-1.9.3-p125/lib/ruby/1.9.1/webrick/httpserver.rb:138:in `service'
82
+ /home/obale/.rvm/rubies/ruby-1.9.3-p125/lib/ruby/1.9.1/webrick/httpserver.rb:94:in `run'
83
+ /home/obale/.rvm/rubies/ruby-1.9.3-p125/lib/ruby/1.9.1/webrick/server.rb:191:in `block in start_thread'
84
+
85
+
86
+ Rendered /home/obale/.rvm/gems/ruby-1.9.3-p125@sigimera/gems/actionpack-3.2.2/lib/action_dispatch/middleware/templates/rescues/routing_error.erb within rescues/layout (3.3ms)
@@ -8483,3 +8483,31 @@ Interrupt: : rollback transaction
8483
8483
   (0.0ms) rollback transaction
8484
8484
   (0.3ms) begin transaction
8485
8485
   (0.0ms) rollback transaction
8486
+  (0.3ms) begin transaction
8487
+  (0.0ms) rollback transaction
8488
+  (0.3ms) begin transaction
8489
+  (0.1ms) rollback transaction
8490
+  (0.3ms) begin transaction
8491
+  (0.0ms) rollback transaction
8492
+  (0.3ms) begin transaction
8493
+  (0.0ms) rollback transaction
8494
+  (0.3ms) begin transaction
8495
+  (0.0ms) rollback transaction
8496
+  (0.3ms) begin transaction
8497
+  (0.0ms) rollback transaction
8498
+  (0.3ms) begin transaction
8499
+  (0.0ms) rollback transaction
8500
+  (0.3ms) begin transaction
8501
+  (0.1ms) rollback transaction
8502
+  (0.3ms) begin transaction
8503
+  (0.0ms) rollback transaction
8504
+  (0.3ms) begin transaction
8505
+  (0.0ms) rollback transaction
8506
+  (0.3ms) begin transaction
8507
+  (0.1ms) rollback transaction
8508
+  (0.1ms) begin transaction
8509
+  (0.0ms) rollback transaction
8510
+  (0.3ms) begin transaction
8511
+  (0.0ms) rollback transaction
8512
+  (0.3ms) begin transaction
8513
+  (0.0ms) rollback transaction
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: semantic-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-15 00:00:00.000000000 Z
12
+ date: 2012-04-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: httparty
@@ -91,6 +91,22 @@ dependencies:
91
91
  - - ! '>='
92
92
  - !ruby/object:Gem::Version
93
93
  version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: grit
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
94
110
  - !ruby/object:Gem::Dependency
95
111
  name: rails
96
112
  requirement: !ruby/object:Gem::Requirement
@@ -190,7 +206,6 @@ files:
190
206
  - MIT-LICENSE
191
207
  - README.rdoc
192
208
  - Rakefile
193
- - changelog.sh
194
209
  - exploitation/freebase.rb
195
210
  - lib/semantic_crawler.rb
196
211
  - lib/semantic_crawler/dbpedia.rb
@@ -263,7 +278,8 @@ files:
263
278
  - test/dummy/log/test.log
264
279
  - test/dummy/log/development.log
265
280
  homepage: https://github.com/obale/semantic_crawler
266
- licenses: []
281
+ licenses:
282
+ - MIT
267
283
  post_install_message:
268
284
  rdoc_options: []
269
285
  require_paths:
@@ -276,7 +292,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
276
292
  version: '0'
277
293
  segments:
278
294
  - 0
279
- hash: 4442978705485601093
295
+ hash: -4201444334486941347
280
296
  required_rubygems_version: !ruby/object:Gem::Requirement
281
297
  none: false
282
298
  requirements:
@@ -285,10 +301,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
285
301
  version: '0'
286
302
  segments:
287
303
  - 0
288
- hash: 4442978705485601093
304
+ hash: -4201444334486941347
289
305
  requirements: []
290
306
  rubyforge_project:
291
- rubygems_version: 1.8.21
307
+ rubygems_version: 1.8.22
292
308
  signing_key:
293
309
  specification_version: 3
294
310
  summary: SemanticCrawler is a ruby library that encapsulates data gathering from different
data/changelog.sh DELETED
@@ -1,4 +0,0 @@
1
- #!/bin/bash
2
- currentMerge=$(git log --merges master -1 --format=%h)
3
- previousMerge=$(git log --merges master -2 --format=%h | tail -n1)
4
- git log ${previousMerge}..${currentMerge}