semantic-crawler 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -13,7 +13,6 @@ rescue LoadError
13
13
  end
14
14
 
15
15
  require 'rake'
16
- require 'rspec/core/rake_task'
17
16
 
18
17
  RDoc::Task.new(:rdoc) do |rdoc|
19
18
  rdoc.rdoc_dir = 'rdoc'
@@ -38,6 +37,11 @@ Rake::TestTask.new(:test) do |t|
38
37
  t.verbose = false
39
38
  end
40
39
 
41
- RSpec::Core::RakeTask.new(:spec)
40
+ require 'rspec/core/rake_task'
41
+ RSpec::Core::RakeTask.new(:spec) do |t|
42
+ t.fail_on_error = true
43
+ t.rspec_opts = "--colour --format doc"
44
+ end
45
+
42
46
 
43
47
  task :default => [ :test, :spec ]
@@ -2,6 +2,8 @@
2
2
 
3
3
  require "httparty"
4
4
  require "geonames"
5
+ require "nokogiri"
6
+ require "microdata"
5
7
 
6
8
  # The top level module contains the different data sources
7
9
  # as sub-modules. Currently there are the following modules
@@ -54,3 +56,6 @@ require "semantic_crawler/fao/country"
54
56
  # Freebase.com - module: Freebase
55
57
  require "semantic_crawler/freebase/country"
56
58
 
59
+ # Websites - module: Websites
60
+ require "semantic_crawler/websites"
61
+ require "semantic_crawler/websites/micro_data"
@@ -1,4 +1,4 @@
1
1
  module SemanticCrawler
2
2
  # The current version of this library.
3
- VERSION = "0.6.0"
3
+ VERSION = "0.7.0"
4
4
  end
@@ -0,0 +1,4 @@
1
+ # The module Websites encapsulates methods that are capable of extraction
2
+ # of semantic from websites, e.g. microdata
3
+ module SemanticCrawler::Websites
4
+ end
@@ -0,0 +1,52 @@
1
+ # encoding: UTF-8
2
+
3
+ module SemanticCrawler
4
+ module Websites
5
+
6
+ # Extract microdata from a website and output it as JSON
7
+ class MicroData
8
+
9
+ attr_accessor :url
10
+ attr_accessor :microdata
11
+
12
+ def initialize(url)
13
+ doc = Nokogiri::HTML(open(url))
14
+ microdata = Microdata::Document.new(doc.to_s)
15
+ items = microdata.extract_items
16
+ self.microdata = extract_microdata(items)
17
+ end
18
+
19
+ def to_json
20
+ microdata.to_json
21
+ end
22
+
23
+ def to_s
24
+ microdata
25
+ end
26
+
27
+ private
28
+ def extract_microdata(items)
29
+ hash = Hash.new
30
+ if items.kind_of? Array and items.first and items.first.kind_of? String
31
+ hash = items
32
+ elsif items.kind_of? Array and items.first
33
+ items.each do |item|
34
+ props = item.properties
35
+ properties = Hash.new
36
+ props.each do |key, value|
37
+ hash[item.type.first] ||= Array.new
38
+ values = extract_microdata(value)
39
+ properties.merge!(key.to_s => values)
40
+ end
41
+ hash[item.type.first] << properties
42
+ end
43
+ else
44
+ raise "Not implemented!"
45
+ end
46
+ hash
47
+ end
48
+
49
+ end
50
+
51
+ end
52
+ end
@@ -0,0 +1,89 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'awesome_print'
4
+
5
+ require 'microdata'
6
+
7
+ module Extractor
8
+ class Extractor::HTMLParser
9
+
10
+ def extract_microdata(items)
11
+ hash = Hash.new
12
+ if items.kind_of? Array and items.first and items.first.kind_of? String
13
+ hash = items
14
+ elsif items.kind_of? Array and items.first
15
+ items.each do |item|
16
+ props = item.properties
17
+ properties = Hash.new
18
+ props.each do |key, value|
19
+ hash[item.type.first] ||= Array.new
20
+ values = extract_microdata(value)
21
+ properties.merge!(key.to_s => values)
22
+ end
23
+ hash[item.type.first] << properties
24
+ end
25
+ else
26
+ raise "Not implemented!"
27
+ end
28
+ hash
29
+ end
30
+
31
+ def get_microdata_json(url)
32
+ doc = Nokogiri::HTML(open(url))
33
+ microdata = Microdata::Document.new(doc.to_s)
34
+ items = microdata.extract_items
35
+ extract_microdata(items)
36
+ end
37
+
38
+ def extractLink(url)
39
+ doc = Nokogiri::HTML(open(url))
40
+
41
+ doc.css('link').each do |node|
42
+ if !node['type'].nil?
43
+ puts node['type'] + " => " + node['href']
44
+ if node['type'].downcase.eql?("application/rdf+xml")
45
+ rdf = Nokogiri::XML(open(node['href']))
46
+ ap "-------------"
47
+ ap "Name: #{rdf.xpath("/rdf:RDF/foaf:Person/foaf:name", rdf.namespaces).text}"
48
+ ap "Homepage: #{rdf.xpath("/rdf:RDF/foaf:Person/foaf:homepage/@rdf:resource", rdf.namespaces).text}"
49
+ pubs = rdf.xpath("/rdf:RDF/foaf:Person/foaf:publications/@rdf:resource", rdf.namespaces)
50
+ if pubs
51
+ pubs.each do |pub|
52
+ publication = rdf.xpath("//rdf:RDF/*[@rdf:ID='#{pub.text.gsub('#', '')}']")
53
+ if publication
54
+ ap "Publications: #{publication.xpath("./bibtex:hasTitle").text}"
55
+ end
56
+ end
57
+ end
58
+ ap "-------------"
59
+ end
60
+ end
61
+ end
62
+ end
63
+
64
+ def extractMeta(url)
65
+ doc = Nokogiri::HTML(open(url))
66
+
67
+ doc.css('meta').each do |node|
68
+ if !node['name'].nil?
69
+ puts node['name'] + " => " + node['content']
70
+ end
71
+ if !node['property'].nil?
72
+ puts node['property'] + " => " + node['content']
73
+ end
74
+ end
75
+ end
76
+ end
77
+
78
+ end
79
+
80
+ url = "https://www.alex-oberhauser.com"
81
+
82
+ html = Extractor::HTMLParser.new
83
+ #html.extractLink url
84
+ json = html.get_microdata_json(url)
85
+ ap json
86
+ ap json["http://schema.org/Organization"].size == 3
87
+ ap json["http://schema.org/EducationalOrganization"].size == 2
88
+ #puts "-------------"
89
+ #html.extractMeta url
@@ -11,8 +11,8 @@ Gem::Specification.new do |s|
11
11
  s.email = ["alex.oberhauser@sigimera.org"]
12
12
  s.licenses = ["MIT"]
13
13
  s.homepage = "https://github.com/obale/semantic_crawler"
14
- s.summary = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources."
15
- s.description = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources. Currently country information from Freebase, Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis information from GDACS.org and geo data from LinkedGeoData are supported. Additional the GeoNames module allows to get Factbook and FAO country information from GPS coordinates."
14
+ s.summary = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources, e.g. microdata from websites."
15
+ s.description = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources. Currently microdata from websites, country information from Freebase, Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis information from GDACS.org and geo data from LinkedGeoData are supported. Additional the GeoNames module allows to get Factbook and FAO country information from GPS coordinates."
16
16
 
17
17
  s.files = `git ls-files`.split("\n")
18
18
  s.executables = `git ls-files -- bin/*`.split('\n').map{ |f| File.basename(f) }
@@ -24,13 +24,16 @@ Gem::Specification.new do |s|
24
24
  #s.add_dependency "google-api-client" # Freebase API access
25
25
  s.add_dependency "nokogiri" # XML Parsing
26
26
  s.add_dependency "geonames" # Use for the GeoNames module
27
+ s.add_dependency "microdata"
27
28
 
28
29
  s.add_development_dependency "yard"
29
30
  s.add_development_dependency "grit"
30
31
  s.add_development_dependency "rails", "~> 3.2"
31
32
  s.add_development_dependency "sqlite3"
32
- s.add_development_dependency "rspec-rails"
33
+ s.add_development_dependency "rspec"
33
34
  s.add_development_dependency "simplecov"
34
35
  s.add_development_dependency "simplecov-rcov"
36
+ s.add_development_dependency "pry"
37
+ s.add_development_dependency "awesome_print"
35
38
 
36
39
  end
@@ -7,34 +7,34 @@ describe SemanticCrawler::GeoNames do
7
7
  @innsbruck = SemanticCrawler::GeoNames::Country.new(47.271338, 11.395333)
8
8
  end
9
9
 
10
- it "get country code" do
10
+ xit "get country code" do
11
11
  @innsbruck.get_country_code.should eq("AT")
12
12
  end
13
13
 
14
- it "get wikipedia articles" do
14
+ xit "get wikipedia articles" do
15
15
  articles = @innsbruck.get_wikipedia_articles
16
16
  articles.each do |article|
17
17
  article.wikipedia_url.to_s.start_with?("http").should be_true
18
18
  end
19
19
  end
20
20
 
21
- it "get country name" do
21
+ xit "get country name" do
22
22
  @innsbruck.get_country_name.should eq("Austria")
23
23
  end
24
24
 
25
- it "get factbook country" do
25
+ xit "get factbook country" do
26
26
  factbook = @innsbruck.get_factbook_country
27
27
  factbook.background.to_s.size.should > 0
28
28
  factbook.country_name.to_s.should eq("Austria")
29
29
  end
30
30
 
31
- it "get fao country" do
31
+ xit "get fao country" do
32
32
  fao = @innsbruck.get_fao_country
33
33
  fao.country_name.to_s.should eq("Austria")
34
34
  fao.population_notes.to_s.should eq("2010 Revision from the UN Population Division")
35
35
  end
36
36
 
37
- it "get freebase country" do
37
+ xit "get freebase country" do
38
38
  freebase = @innsbruck.get_freebase_country
39
39
  freebase.country_name.to_s.should eq("Austria")
40
40
  freebase.website.should eq("http://www.austria.gv.at/")
@@ -0,0 +1,9 @@
1
+ require 'spec_helper'
2
+
3
+ describe SemanticCrawler::Websites do
4
+ it "test microdata extraction" do
5
+ m = SemanticCrawler::Websites::MicroData.new("https://www.alex-oberhauser.com").to_s
6
+ m['http://schema.org/ItemList'].first['itemListElement'].size.should > 0
7
+ m['http://schema.org/Organization'].size.should > 2
8
+ end
9
+ end
@@ -0,0 +1,2 @@
1
+ #!/bin/bash
2
+ pry -I./lib -r "semantic_crawler"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: semantic-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-04 00:00:00.000000000 Z
12
+ date: 2013-04-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: httparty
@@ -75,6 +75,22 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: microdata
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
78
94
  - !ruby/object:Gem::Dependency
79
95
  name: yard
80
96
  requirement: !ruby/object:Gem::Requirement
@@ -140,7 +156,7 @@ dependencies:
140
156
  - !ruby/object:Gem::Version
141
157
  version: '0'
142
158
  - !ruby/object:Gem::Dependency
143
- name: rspec-rails
159
+ name: rspec
144
160
  requirement: !ruby/object:Gem::Requirement
145
161
  none: false
146
162
  requirements:
@@ -187,11 +203,44 @@ dependencies:
187
203
  - - ! '>='
188
204
  - !ruby/object:Gem::Version
189
205
  version: '0'
206
+ - !ruby/object:Gem::Dependency
207
+ name: pry
208
+ requirement: !ruby/object:Gem::Requirement
209
+ none: false
210
+ requirements:
211
+ - - ! '>='
212
+ - !ruby/object:Gem::Version
213
+ version: '0'
214
+ type: :development
215
+ prerelease: false
216
+ version_requirements: !ruby/object:Gem::Requirement
217
+ none: false
218
+ requirements:
219
+ - - ! '>='
220
+ - !ruby/object:Gem::Version
221
+ version: '0'
222
+ - !ruby/object:Gem::Dependency
223
+ name: awesome_print
224
+ requirement: !ruby/object:Gem::Requirement
225
+ none: false
226
+ requirements:
227
+ - - ! '>='
228
+ - !ruby/object:Gem::Version
229
+ version: '0'
230
+ type: :development
231
+ prerelease: false
232
+ version_requirements: !ruby/object:Gem::Requirement
233
+ none: false
234
+ requirements:
235
+ - - ! '>='
236
+ - !ruby/object:Gem::Version
237
+ version: '0'
190
238
  description: SemanticCrawler is a ruby library that encapsulates data gathering from
191
- different sources. Currently country information from Freebase, Factbook and FAO
192
- (Food and Agriculture Organization of the United Nations), crisis information from
193
- GDACS.org and geo data from LinkedGeoData are supported. Additional the GeoNames
194
- module allows to get Factbook and FAO country information from GPS coordinates.
239
+ different sources. Currently microdata from websites, country information from Freebase,
240
+ Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis
241
+ information from GDACS.org and geo data from LinkedGeoData are supported. Additional
242
+ the GeoNames module allows to get Factbook and FAO country information from GPS
243
+ coordinates.
195
244
  email:
196
245
  - alex.oberhauser@sigimera.org
197
246
  executables: []
@@ -227,8 +276,11 @@ files:
227
276
  - lib/semantic_crawler/linked_geo_data/relevant_node.rb
228
277
  - lib/semantic_crawler/linked_geo_data/relevant_nodes.rb
229
278
  - lib/semantic_crawler/version.rb
279
+ - lib/semantic_crawler/websites.rb
280
+ - lib/semantic_crawler/websites/micro_data.rb
230
281
  - lib/tasks/semantic_crawler_tasks.rake
231
282
  - log/.gitkeep
283
+ - meta_extract.rb
232
284
  - semantic_crawler.gemspec
233
285
  - spec/dbpedia_spec.rb
234
286
  - spec/factbook_spec.rb
@@ -239,6 +291,8 @@ files:
239
291
  - spec/geo_names_spec.rb
240
292
  - spec/linked_geo_data_spec.rb
241
293
  - spec/spec_helper.rb
294
+ - spec/websites_spec.rb
295
+ - start_pry.sh
242
296
  - test/dummy/README.rdoc
243
297
  - test/dummy/Rakefile
244
298
  - test/dummy/app/assets/javascripts/application.js
@@ -276,7 +330,6 @@ files:
276
330
  - test/semantic_crawler_test.rb
277
331
  - test/test_helper.rb
278
332
  - test/dummy/log/test.log
279
- - test/dummy/log/development.log
280
333
  homepage: https://github.com/obale/semantic_crawler
281
334
  licenses:
282
335
  - MIT
@@ -290,30 +343,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
290
343
  - - ! '>='
291
344
  - !ruby/object:Gem::Version
292
345
  version: '0'
293
- segments:
294
- - 0
295
- hash: 1493129874267030689
296
346
  required_rubygems_version: !ruby/object:Gem::Requirement
297
347
  none: false
298
348
  requirements:
299
349
  - - ! '>='
300
350
  - !ruby/object:Gem::Version
301
351
  version: '0'
302
- segments:
303
- - 0
304
- hash: 1493129874267030689
305
352
  requirements: []
306
353
  rubyforge_project:
307
- rubygems_version: 1.8.24
354
+ rubygems_version: 1.8.25
308
355
  signing_key:
309
356
  specification_version: 3
310
357
  summary: SemanticCrawler is a ruby library that encapsulates data gathering from different
311
- sources.
358
+ sources, e.g. microdata from websites.
312
359
  test_files:
313
360
  - test/test_helper.rb
314
361
  - test/semantic_crawler_test.rb
315
362
  - test/dummy/log/test.log
316
- - test/dummy/log/development.log
317
363
  - test/dummy/public/422.html
318
364
  - test/dummy/public/favicon.ico
319
365
  - test/dummy/public/500.html
@@ -351,6 +397,7 @@ test_files:
351
397
  - spec/geo_names_spec.rb
352
398
  - spec/spec_helper.rb
353
399
  - spec/freebase_spec.rb
400
+ - spec/websites_spec.rb
354
401
  - spec/gdacs_spec.rb
355
402
  - spec/fao_austria_spec.rb
356
403
  has_rdoc: