semantic-crawler 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +6 -2
- data/lib/semantic_crawler.rb +5 -0
- data/lib/semantic_crawler/version.rb +1 -1
- data/lib/semantic_crawler/websites.rb +4 -0
- data/lib/semantic_crawler/websites/micro_data.rb +52 -0
- data/meta_extract.rb +89 -0
- data/semantic_crawler.gemspec +6 -3
- data/spec/geo_names_spec.rb +6 -6
- data/spec/websites_spec.rb +9 -0
- data/start_pry.sh +2 -0
- metadata +64 -17
data/Rakefile
CHANGED
@@ -13,7 +13,6 @@ rescue LoadError
|
|
13
13
|
end
|
14
14
|
|
15
15
|
require 'rake'
|
16
|
-
require 'rspec/core/rake_task'
|
17
16
|
|
18
17
|
RDoc::Task.new(:rdoc) do |rdoc|
|
19
18
|
rdoc.rdoc_dir = 'rdoc'
|
@@ -38,6 +37,11 @@ Rake::TestTask.new(:test) do |t|
|
|
38
37
|
t.verbose = false
|
39
38
|
end
|
40
39
|
|
41
|
-
|
40
|
+
require 'rspec/core/rake_task'
|
41
|
+
RSpec::Core::RakeTask.new(:spec) do |t|
|
42
|
+
t.fail_on_error = true
|
43
|
+
t.rspec_opts = "--colour --format doc"
|
44
|
+
end
|
45
|
+
|
42
46
|
|
43
47
|
task :default => [ :test, :spec ]
|
data/lib/semantic_crawler.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
require "httparty"
|
4
4
|
require "geonames"
|
5
|
+
require "nokogiri"
|
6
|
+
require "microdata"
|
5
7
|
|
6
8
|
# The top level module contains the different data sources
|
7
9
|
# as sub-modules. Currently there are the following modules
|
@@ -54,3 +56,6 @@ require "semantic_crawler/fao/country"
|
|
54
56
|
# Freebase.com - module: Freebase
|
55
57
|
require "semantic_crawler/freebase/country"
|
56
58
|
|
59
|
+
# Websites - module: Websites
|
60
|
+
require "semantic_crawler/websites"
|
61
|
+
require "semantic_crawler/websites/micro_data"
|
# encoding: UTF-8

module SemanticCrawler
  module Websites

    # Extracts microdata (schema.org items) from a website and exposes the
    # result as a Hash that can be serialized to JSON.
    class MicroData

      # The URL the document was fetched from.
      attr_accessor :url
      # Hash of extracted microdata, keyed by item type URL
      # (e.g. "http://schema.org/Organization" => [{...}, ...]).
      attr_accessor :microdata

      # Fetches +url+, parses the HTML and extracts all microdata items.
      #
      # @param url [String] address of the page to crawl
      # @raise [RuntimeError] if the page contains no extractable items
      def initialize(url)
        require "open-uri" # Kernel#open no longer opens URLs on Ruby 3+
        self.url = url     # fix: @url was never assigned, so #url returned nil
        doc = Nokogiri::HTML(URI.open(url))
        items = Microdata::Document.new(doc.to_s).extract_items
        self.microdata = extract_microdata(items)
      end

      # @return [String] the extracted microdata serialized as JSON
      def to_json
        microdata.to_json
      end

      # @return [Hash] the raw extracted microdata structure
      # NOTE(review): despite the name this returns a Hash, not a String;
      # kept as-is because callers (e.g. the specs) index into the result.
      def to_s
        microdata
      end

      private

      # Recursively converts microdata items into plain Ruby values.
      # An array of strings is returned unchanged; an array of items becomes
      # a Hash of { type-url => [properties-hash, ...] }.
      # Raises for anything else (including an empty item list).
      def extract_microdata(items)
        hash = Hash.new
        if items.kind_of?(Array) && items.first && items.first.kind_of?(String)
          hash = items
        elsif items.kind_of?(Array) && items.first
          items.each do |item|
            properties = Hash.new
            item.properties.each do |key, value|
              hash[item.type.first] ||= Array.new
              properties.merge!(key.to_s => extract_microdata(value))
            end
            hash[item.type.first] << properties
          end
        else
          raise "Not implemented!"
        end
        hash
      end

    end

  end
end
|
data/meta_extract.rb
ADDED
# Standalone experiment script: extracts microdata, linked RDF documents and
# <meta> tags from a web page and pretty-prints the results.
require 'nokogiri'
require 'open-uri'
require 'awesome_print'

require 'microdata'

module Extractor
  # Parses an HTML page and extracts structured metadata from it.
  # (was `class Extractor::HTMLParser` inside `module Extractor` — redundant
  # re-namespacing that resolves to the same Extractor::HTMLParser constant)
  class HTMLParser

    # Recursively converts microdata items into plain Ruby values.
    # String arrays pass through; item arrays become
    # { type-url => [properties-hash, ...] }. Raises otherwise.
    def extract_microdata(items)
      hash = Hash.new
      if items.kind_of?(Array) && items.first && items.first.kind_of?(String)
        hash = items
      elsif items.kind_of?(Array) && items.first
        items.each do |item|
          properties = Hash.new
          item.properties.each do |key, value|
            hash[item.type.first] ||= Array.new
            properties.merge!(key.to_s => extract_microdata(value))
          end
          hash[item.type.first] << properties
        end
      else
        raise "Not implemented!"
      end
      hash
    end

    # Fetches +url+ and returns its microdata as a plain Hash.
    # URI.open is used explicitly: Kernel#open stopped handling URLs in Ruby 3.
    def get_microdata_json(url)
      doc = Nokogiri::HTML(URI.open(url))
      items = Microdata::Document.new(doc.to_s).extract_items
      extract_microdata(items)
    end

    # Prints every typed <link> of +url+; for application/rdf+xml links it
    # fetches the RDF document and prints FOAF name/homepage/publications.
    # NOTE(review): camelCase name kept for backwards compatibility.
    def extractLink(url)
      doc = Nokogiri::HTML(URI.open(url))

      doc.css('link').each do |node|
        if !node['type'].nil?
          puts node['type'] + " => " + node['href']
          if node['type'].downcase.eql?("application/rdf+xml")
            rdf = Nokogiri::XML(URI.open(node['href']))
            ap "-------------"
            ap "Name: #{rdf.xpath("/rdf:RDF/foaf:Person/foaf:name", rdf.namespaces).text}"
            ap "Homepage: #{rdf.xpath("/rdf:RDF/foaf:Person/foaf:homepage/@rdf:resource", rdf.namespaces).text}"
            pubs = rdf.xpath("/rdf:RDF/foaf:Person/foaf:publications/@rdf:resource", rdf.namespaces)
            if pubs
              pubs.each do |pub|
                publication = rdf.xpath("//rdf:RDF/*[@rdf:ID='#{pub.text.gsub('#', '')}']")
                if publication
                  ap "Publications: #{publication.xpath("./bibtex:hasTitle").text}"
                end
              end
            end
            ap "-------------"
          end
        end
      end
    end

    # Prints all <meta> name/content and property/content pairs of +url+.
    # NOTE(review): camelCase name kept for backwards compatibility.
    def extractMeta(url)
      doc = Nokogiri::HTML(URI.open(url))

      doc.css('meta').each do |node|
        if !node['name'].nil?
          puts node['name'] + " => " + node['content']
        end
        if !node['property'].nil?
          puts node['property'] + " => " + node['content']
        end
      end
    end

  end

end

# Guarded so requiring this file does not fire network requests;
# behavior when run directly (`ruby meta_extract.rb`) is unchanged.
if __FILE__ == $PROGRAM_NAME
  url = "https://www.alex-oberhauser.com"

  html = Extractor::HTMLParser.new
  #html.extractLink url
  json = html.get_microdata_json(url)
  ap json
  ap json["http://schema.org/Organization"].size == 3
  ap json["http://schema.org/EducationalOrganization"].size == 2
  #puts "-------------"
  #html.extractMeta url
end
data/semantic_crawler.gemspec
CHANGED
@@ -11,8 +11,8 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.email = ["alex.oberhauser@sigimera.org"]
|
12
12
|
s.licenses = ["MIT"]
|
13
13
|
s.homepage = "https://github.com/obale/semantic_crawler"
|
14
|
-
s.summary = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources."
|
15
|
-
s.description = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources. Currently country information from Freebase, Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis information from GDACS.org and geo data from LinkedGeoData are supported. Additional the GeoNames module allows to get Factbook and FAO country information from GPS coordinates."
|
14
|
+
s.summary = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources, e.g. microdata from websites."
|
15
|
+
s.description = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources. Currently microdata from websites, country information from Freebase, Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis information from GDACS.org and geo data from LinkedGeoData are supported. Additional the GeoNames module allows to get Factbook and FAO country information from GPS coordinates."
|
16
16
|
|
17
17
|
s.files = `git ls-files`.split("\n")
|
18
18
|
s.executables = `git ls-files -- bin/*`.split('\n').map{ |f| File.basename(f) }
|
@@ -24,13 +24,16 @@ Gem::Specification.new do |s|
|
|
24
24
|
#s.add_dependency "google-api-client" # Freebase API access
|
25
25
|
s.add_dependency "nokogiri" # XML Parsing
|
26
26
|
s.add_dependency "geonames" # Use for the GeoNames module
|
27
|
+
s.add_dependency "microdata"
|
27
28
|
|
28
29
|
s.add_development_dependency "yard"
|
29
30
|
s.add_development_dependency "grit"
|
30
31
|
s.add_development_dependency "rails", "~> 3.2"
|
31
32
|
s.add_development_dependency "sqlite3"
|
32
|
-
s.add_development_dependency "rspec
|
33
|
+
s.add_development_dependency "rspec"
|
33
34
|
s.add_development_dependency "simplecov"
|
34
35
|
s.add_development_dependency "simplecov-rcov"
|
36
|
+
s.add_development_dependency "pry"
|
37
|
+
s.add_development_dependency "awesome_print"
|
35
38
|
|
36
39
|
end
|
data/spec/geo_names_spec.rb
CHANGED
@@ -7,34 +7,34 @@ describe SemanticCrawler::GeoNames do
|
|
7
7
|
@innsbruck = SemanticCrawler::GeoNames::Country.new(47.271338, 11.395333)
|
8
8
|
end
|
9
9
|
|
10
|
-
|
10
|
+
xit "get country code" do
|
11
11
|
@innsbruck.get_country_code.should eq("AT")
|
12
12
|
end
|
13
13
|
|
14
|
-
|
14
|
+
xit "get wikipedia articles" do
|
15
15
|
articles = @innsbruck.get_wikipedia_articles
|
16
16
|
articles.each do |article|
|
17
17
|
article.wikipedia_url.to_s.start_with?("http").should be_true
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
|
21
|
+
xit "get country name" do
|
22
22
|
@innsbruck.get_country_name.should eq("Austria")
|
23
23
|
end
|
24
24
|
|
25
|
-
|
25
|
+
xit "get factbook country" do
|
26
26
|
factbook = @innsbruck.get_factbook_country
|
27
27
|
factbook.background.to_s.size.should > 0
|
28
28
|
factbook.country_name.to_s.should eq("Austria")
|
29
29
|
end
|
30
30
|
|
31
|
-
|
31
|
+
xit "get fao country" do
|
32
32
|
fao = @innsbruck.get_fao_country
|
33
33
|
fao.country_name.to_s.should eq("Austria")
|
34
34
|
fao.population_notes.to_s.should eq("2010 Revision from the UN Population Division")
|
35
35
|
end
|
36
36
|
|
37
|
-
|
37
|
+
xit "get freebase country" do
|
38
38
|
freebase = @innsbruck.get_freebase_country
|
39
39
|
freebase.country_name.to_s.should eq("Austria")
|
40
40
|
freebase.website.should eq("http://www.austria.gv.at/")
|
require 'spec_helper'

describe SemanticCrawler::Websites do
  # Integration test: fetches a live page and inspects the extracted microdata.
  it "test microdata extraction" do
    url = "https://www.alex-oberhauser.com"
    microdata = SemanticCrawler::Websites::MicroData.new(url).to_s
    item_list = microdata['http://schema.org/ItemList'].first
    item_list['itemListElement'].size.should > 0
    microdata['http://schema.org/Organization'].size.should > 2
  end
end
|
data/start_pry.sh
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: semantic-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-04-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: httparty
|
@@ -75,6 +75,22 @@ dependencies:
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: microdata
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
78
94
|
- !ruby/object:Gem::Dependency
|
79
95
|
name: yard
|
80
96
|
requirement: !ruby/object:Gem::Requirement
|
@@ -140,7 +156,7 @@ dependencies:
|
|
140
156
|
- !ruby/object:Gem::Version
|
141
157
|
version: '0'
|
142
158
|
- !ruby/object:Gem::Dependency
|
143
|
-
name: rspec
|
159
|
+
name: rspec
|
144
160
|
requirement: !ruby/object:Gem::Requirement
|
145
161
|
none: false
|
146
162
|
requirements:
|
@@ -187,11 +203,44 @@ dependencies:
|
|
187
203
|
- - ! '>='
|
188
204
|
- !ruby/object:Gem::Version
|
189
205
|
version: '0'
|
206
|
+
- !ruby/object:Gem::Dependency
|
207
|
+
name: pry
|
208
|
+
requirement: !ruby/object:Gem::Requirement
|
209
|
+
none: false
|
210
|
+
requirements:
|
211
|
+
- - ! '>='
|
212
|
+
- !ruby/object:Gem::Version
|
213
|
+
version: '0'
|
214
|
+
type: :development
|
215
|
+
prerelease: false
|
216
|
+
version_requirements: !ruby/object:Gem::Requirement
|
217
|
+
none: false
|
218
|
+
requirements:
|
219
|
+
- - ! '>='
|
220
|
+
- !ruby/object:Gem::Version
|
221
|
+
version: '0'
|
222
|
+
- !ruby/object:Gem::Dependency
|
223
|
+
name: awesome_print
|
224
|
+
requirement: !ruby/object:Gem::Requirement
|
225
|
+
none: false
|
226
|
+
requirements:
|
227
|
+
- - ! '>='
|
228
|
+
- !ruby/object:Gem::Version
|
229
|
+
version: '0'
|
230
|
+
type: :development
|
231
|
+
prerelease: false
|
232
|
+
version_requirements: !ruby/object:Gem::Requirement
|
233
|
+
none: false
|
234
|
+
requirements:
|
235
|
+
- - ! '>='
|
236
|
+
- !ruby/object:Gem::Version
|
237
|
+
version: '0'
|
190
238
|
description: SemanticCrawler is a ruby library that encapsulates data gathering from
|
191
|
-
different sources. Currently country information from Freebase,
|
192
|
-
(Food and Agriculture Organization of the United Nations), crisis
|
193
|
-
GDACS.org and geo data from LinkedGeoData are supported. Additional
|
194
|
-
module allows to get Factbook and FAO country information from GPS
|
239
|
+
different sources. Currently microdata from websites, country information from Freebase,
|
240
|
+
Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis
|
241
|
+
information from GDACS.org and geo data from LinkedGeoData are supported. Additional
|
242
|
+
the GeoNames module allows to get Factbook and FAO country information from GPS
|
243
|
+
coordinates.
|
195
244
|
email:
|
196
245
|
- alex.oberhauser@sigimera.org
|
197
246
|
executables: []
|
@@ -227,8 +276,11 @@ files:
|
|
227
276
|
- lib/semantic_crawler/linked_geo_data/relevant_node.rb
|
228
277
|
- lib/semantic_crawler/linked_geo_data/relevant_nodes.rb
|
229
278
|
- lib/semantic_crawler/version.rb
|
279
|
+
- lib/semantic_crawler/websites.rb
|
280
|
+
- lib/semantic_crawler/websites/micro_data.rb
|
230
281
|
- lib/tasks/semantic_crawler_tasks.rake
|
231
282
|
- log/.gitkeep
|
283
|
+
- meta_extract.rb
|
232
284
|
- semantic_crawler.gemspec
|
233
285
|
- spec/dbpedia_spec.rb
|
234
286
|
- spec/factbook_spec.rb
|
@@ -239,6 +291,8 @@ files:
|
|
239
291
|
- spec/geo_names_spec.rb
|
240
292
|
- spec/linked_geo_data_spec.rb
|
241
293
|
- spec/spec_helper.rb
|
294
|
+
- spec/websites_spec.rb
|
295
|
+
- start_pry.sh
|
242
296
|
- test/dummy/README.rdoc
|
243
297
|
- test/dummy/Rakefile
|
244
298
|
- test/dummy/app/assets/javascripts/application.js
|
@@ -276,7 +330,6 @@ files:
|
|
276
330
|
- test/semantic_crawler_test.rb
|
277
331
|
- test/test_helper.rb
|
278
332
|
- test/dummy/log/test.log
|
279
|
-
- test/dummy/log/development.log
|
280
333
|
homepage: https://github.com/obale/semantic_crawler
|
281
334
|
licenses:
|
282
335
|
- MIT
|
@@ -290,30 +343,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
290
343
|
- - ! '>='
|
291
344
|
- !ruby/object:Gem::Version
|
292
345
|
version: '0'
|
293
|
-
segments:
|
294
|
-
- 0
|
295
|
-
hash: 1493129874267030689
|
296
346
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
297
347
|
none: false
|
298
348
|
requirements:
|
299
349
|
- - ! '>='
|
300
350
|
- !ruby/object:Gem::Version
|
301
351
|
version: '0'
|
302
|
-
segments:
|
303
|
-
- 0
|
304
|
-
hash: 1493129874267030689
|
305
352
|
requirements: []
|
306
353
|
rubyforge_project:
|
307
|
-
rubygems_version: 1.8.
|
354
|
+
rubygems_version: 1.8.25
|
308
355
|
signing_key:
|
309
356
|
specification_version: 3
|
310
357
|
summary: SemanticCrawler is a ruby library that encapsulates data gathering from different
|
311
|
-
sources.
|
358
|
+
sources, e.g. microdata from websites.
|
312
359
|
test_files:
|
313
360
|
- test/test_helper.rb
|
314
361
|
- test/semantic_crawler_test.rb
|
315
362
|
- test/dummy/log/test.log
|
316
|
-
- test/dummy/log/development.log
|
317
363
|
- test/dummy/public/422.html
|
318
364
|
- test/dummy/public/favicon.ico
|
319
365
|
- test/dummy/public/500.html
|
@@ -351,6 +397,7 @@ test_files:
|
|
351
397
|
- spec/geo_names_spec.rb
|
352
398
|
- spec/spec_helper.rb
|
353
399
|
- spec/freebase_spec.rb
|
400
|
+
- spec/websites_spec.rb
|
354
401
|
- spec/gdacs_spec.rb
|
355
402
|
- spec/fao_austria_spec.rb
|
356
403
|
has_rdoc:
|