semantic-crawler 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +6 -2
- data/lib/semantic_crawler.rb +5 -0
- data/lib/semantic_crawler/version.rb +1 -1
- data/lib/semantic_crawler/websites.rb +4 -0
- data/lib/semantic_crawler/websites/micro_data.rb +52 -0
- data/meta_extract.rb +89 -0
- data/semantic_crawler.gemspec +6 -3
- data/spec/geo_names_spec.rb +6 -6
- data/spec/websites_spec.rb +9 -0
- data/start_pry.sh +2 -0
- metadata +64 -17
data/Rakefile
CHANGED
@@ -13,7 +13,6 @@ rescue LoadError
|
|
13
13
|
end
|
14
14
|
|
15
15
|
require 'rake'
|
16
|
-
require 'rspec/core/rake_task'
|
17
16
|
|
18
17
|
RDoc::Task.new(:rdoc) do |rdoc|
|
19
18
|
rdoc.rdoc_dir = 'rdoc'
|
@@ -38,6 +37,11 @@ Rake::TestTask.new(:test) do |t|
|
|
38
37
|
t.verbose = false
|
39
38
|
end
|
40
39
|
|
41
|
-
|
40
|
+
require 'rspec/core/rake_task'
|
41
|
+
RSpec::Core::RakeTask.new(:spec) do |t|
|
42
|
+
t.fail_on_error = true
|
43
|
+
t.rspec_opts = "--colour --format doc"
|
44
|
+
end
|
45
|
+
|
42
46
|
|
43
47
|
task :default => [ :test, :spec ]
|
data/lib/semantic_crawler.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
require "httparty"
|
4
4
|
require "geonames"
|
5
|
+
require "nokogiri"
|
6
|
+
require "microdata"
|
5
7
|
|
6
8
|
# The top level module contains the different data sources
|
7
9
|
# as sub-modules. Currently there are the following modules
|
@@ -54,3 +56,6 @@ require "semantic_crawler/fao/country"
|
|
54
56
|
# Freebase.com - module: Freebase
|
55
57
|
require "semantic_crawler/freebase/country"
|
56
58
|
|
59
|
+
# Websites - module: Websites
|
60
|
+
require "semantic_crawler/websites"
|
61
|
+
require "semantic_crawler/websites/micro_data"
|
# encoding: UTF-8

module SemanticCrawler
  module Websites

    # Extracts microdata (schema.org items) from a website and exposes the
    # result as a Hash that can be serialized to JSON.
    class MicroData

      # The URL the document was fetched from.
      attr_accessor :url
      # Hash of extracted microdata, keyed by item type URL
      # (e.g. "http://schema.org/Organization" => [{...}, ...]).
      attr_accessor :microdata

      # Fetches +url+, parses the HTML and extracts all microdata items.
      #
      # @param url [String] address of the page to crawl
      # @raise [RuntimeError] if the page contains no extractable items
      def initialize(url)
        require "open-uri" # Kernel#open no longer opens URLs on Ruby 3+
        self.url = url     # fix: @url was never assigned, so #url returned nil
        doc = Nokogiri::HTML(URI.open(url))
        items = Microdata::Document.new(doc.to_s).extract_items
        self.microdata = extract_microdata(items)
      end

      # @return [String] the extracted microdata serialized as JSON
      def to_json
        microdata.to_json
      end

      # @return [Hash] the raw extracted microdata structure
      # NOTE(review): despite the name this returns a Hash, not a String;
      # kept as-is because callers (e.g. the specs) index into the result.
      def to_s
        microdata
      end

      private

      # Recursively converts microdata items into plain Ruby values.
      # An array of strings is returned unchanged; an array of items becomes
      # a Hash of { type-url => [properties-hash, ...] }.
      # Raises for anything else (including an empty item list).
      def extract_microdata(items)
        hash = Hash.new
        if items.kind_of?(Array) && items.first && items.first.kind_of?(String)
          hash = items
        elsif items.kind_of?(Array) && items.first
          items.each do |item|
            properties = Hash.new
            item.properties.each do |key, value|
              hash[item.type.first] ||= Array.new
              properties.merge!(key.to_s => extract_microdata(value))
            end
            hash[item.type.first] << properties
          end
        else
          raise "Not implemented!"
        end
        hash
      end

    end

  end
end
|
data/meta_extract.rb
ADDED
# Standalone experiment script: extracts microdata, linked RDF documents and
# <meta> tags from a web page and pretty-prints the results.
require 'nokogiri'
require 'open-uri'
require 'awesome_print'

require 'microdata'

module Extractor
  # Parses an HTML page and extracts structured metadata from it.
  # (was `class Extractor::HTMLParser` inside `module Extractor` — redundant
  # re-namespacing that resolves to the same Extractor::HTMLParser constant)
  class HTMLParser

    # Recursively converts microdata items into plain Ruby values.
    # String arrays pass through; item arrays become
    # { type-url => [properties-hash, ...] }. Raises otherwise.
    def extract_microdata(items)
      hash = Hash.new
      if items.kind_of?(Array) && items.first && items.first.kind_of?(String)
        hash = items
      elsif items.kind_of?(Array) && items.first
        items.each do |item|
          properties = Hash.new
          item.properties.each do |key, value|
            hash[item.type.first] ||= Array.new
            properties.merge!(key.to_s => extract_microdata(value))
          end
          hash[item.type.first] << properties
        end
      else
        raise "Not implemented!"
      end
      hash
    end

    # Fetches +url+ and returns its microdata as a plain Hash.
    # URI.open is used explicitly: Kernel#open stopped handling URLs in Ruby 3.
    def get_microdata_json(url)
      doc = Nokogiri::HTML(URI.open(url))
      items = Microdata::Document.new(doc.to_s).extract_items
      extract_microdata(items)
    end

    # Prints every typed <link> of +url+; for application/rdf+xml links it
    # fetches the RDF document and prints FOAF name/homepage/publications.
    # NOTE(review): camelCase name kept for backwards compatibility.
    def extractLink(url)
      doc = Nokogiri::HTML(URI.open(url))

      doc.css('link').each do |node|
        if !node['type'].nil?
          puts node['type'] + " => " + node['href']
          if node['type'].downcase.eql?("application/rdf+xml")
            rdf = Nokogiri::XML(URI.open(node['href']))
            ap "-------------"
            ap "Name: #{rdf.xpath("/rdf:RDF/foaf:Person/foaf:name", rdf.namespaces).text}"
            ap "Homepage: #{rdf.xpath("/rdf:RDF/foaf:Person/foaf:homepage/@rdf:resource", rdf.namespaces).text}"
            pubs = rdf.xpath("/rdf:RDF/foaf:Person/foaf:publications/@rdf:resource", rdf.namespaces)
            if pubs
              pubs.each do |pub|
                publication = rdf.xpath("//rdf:RDF/*[@rdf:ID='#{pub.text.gsub('#', '')}']")
                if publication
                  ap "Publications: #{publication.xpath("./bibtex:hasTitle").text}"
                end
              end
            end
            ap "-------------"
          end
        end
      end
    end

    # Prints all <meta> name/content and property/content pairs of +url+.
    # NOTE(review): camelCase name kept for backwards compatibility.
    def extractMeta(url)
      doc = Nokogiri::HTML(URI.open(url))

      doc.css('meta').each do |node|
        if !node['name'].nil?
          puts node['name'] + " => " + node['content']
        end
        if !node['property'].nil?
          puts node['property'] + " => " + node['content']
        end
      end
    end

  end

end

# Guarded so requiring this file does not fire network requests;
# behavior when run directly (`ruby meta_extract.rb`) is unchanged.
if __FILE__ == $PROGRAM_NAME
  url = "https://www.alex-oberhauser.com"

  html = Extractor::HTMLParser.new
  #html.extractLink url
  json = html.get_microdata_json(url)
  ap json
  ap json["http://schema.org/Organization"].size == 3
  ap json["http://schema.org/EducationalOrganization"].size == 2
  #puts "-------------"
  #html.extractMeta url
end
data/semantic_crawler.gemspec
CHANGED
@@ -11,8 +11,8 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.email = ["alex.oberhauser@sigimera.org"]
|
12
12
|
s.licenses = ["MIT"]
|
13
13
|
s.homepage = "https://github.com/obale/semantic_crawler"
|
14
|
-
s.summary = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources."
|
15
|
-
s.description = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources. Currently country information from Freebase, Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis information from GDACS.org and geo data from LinkedGeoData are supported. Additional the GeoNames module allows to get Factbook and FAO country information from GPS coordinates."
|
14
|
+
s.summary = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources, e.g. microdata from websites."
|
15
|
+
s.description = "SemanticCrawler is a ruby library that encapsulates data gathering from different sources. Currently microdata from websites, country information from Freebase, Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis information from GDACS.org and geo data from LinkedGeoData are supported. Additional the GeoNames module allows to get Factbook and FAO country information from GPS coordinates."
|
16
16
|
|
17
17
|
s.files = `git ls-files`.split("\n")
|
18
18
|
s.executables = `git ls-files -- bin/*`.split('\n').map{ |f| File.basename(f) }
|
@@ -24,13 +24,16 @@ Gem::Specification.new do |s|
|
|
24
24
|
#s.add_dependency "google-api-client" # Freebase API access
|
25
25
|
s.add_dependency "nokogiri" # XML Parsing
|
26
26
|
s.add_dependency "geonames" # Use for the GeoNames module
|
27
|
+
s.add_dependency "microdata"
|
27
28
|
|
28
29
|
s.add_development_dependency "yard"
|
29
30
|
s.add_development_dependency "grit"
|
30
31
|
s.add_development_dependency "rails", "~> 3.2"
|
31
32
|
s.add_development_dependency "sqlite3"
|
32
|
-
s.add_development_dependency "rspec
|
33
|
+
s.add_development_dependency "rspec"
|
33
34
|
s.add_development_dependency "simplecov"
|
34
35
|
s.add_development_dependency "simplecov-rcov"
|
36
|
+
s.add_development_dependency "pry"
|
37
|
+
s.add_development_dependency "awesome_print"
|
35
38
|
|
36
39
|
end
|
data/spec/geo_names_spec.rb
CHANGED
@@ -7,34 +7,34 @@ describe SemanticCrawler::GeoNames do
|
|
7
7
|
@innsbruck = SemanticCrawler::GeoNames::Country.new(47.271338, 11.395333)
|
8
8
|
end
|
9
9
|
|
10
|
-
|
10
|
+
xit "get country code" do
|
11
11
|
@innsbruck.get_country_code.should eq("AT")
|
12
12
|
end
|
13
13
|
|
14
|
-
|
14
|
+
xit "get wikipedia articles" do
|
15
15
|
articles = @innsbruck.get_wikipedia_articles
|
16
16
|
articles.each do |article|
|
17
17
|
article.wikipedia_url.to_s.start_with?("http").should be_true
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
|
21
|
+
xit "get country name" do
|
22
22
|
@innsbruck.get_country_name.should eq("Austria")
|
23
23
|
end
|
24
24
|
|
25
|
-
|
25
|
+
xit "get factbook country" do
|
26
26
|
factbook = @innsbruck.get_factbook_country
|
27
27
|
factbook.background.to_s.size.should > 0
|
28
28
|
factbook.country_name.to_s.should eq("Austria")
|
29
29
|
end
|
30
30
|
|
31
|
-
|
31
|
+
xit "get fao country" do
|
32
32
|
fao = @innsbruck.get_fao_country
|
33
33
|
fao.country_name.to_s.should eq("Austria")
|
34
34
|
fao.population_notes.to_s.should eq("2010 Revision from the UN Population Division")
|
35
35
|
end
|
36
36
|
|
37
|
-
|
37
|
+
xit "get freebase country" do
|
38
38
|
freebase = @innsbruck.get_freebase_country
|
39
39
|
freebase.country_name.to_s.should eq("Austria")
|
40
40
|
freebase.website.should eq("http://www.austria.gv.at/")
|
require 'spec_helper'

describe SemanticCrawler::Websites do
  # Integration test: fetches a live page and inspects the extracted microdata.
  it "test microdata extraction" do
    url = "https://www.alex-oberhauser.com"
    microdata = SemanticCrawler::Websites::MicroData.new(url).to_s
    item_list = microdata['http://schema.org/ItemList'].first
    item_list['itemListElement'].size.should > 0
    microdata['http://schema.org/Organization'].size.should > 2
  end
end
|
data/start_pry.sh
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: semantic-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-04-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: httparty
|
@@ -75,6 +75,22 @@ dependencies:
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: microdata
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
78
94
|
- !ruby/object:Gem::Dependency
|
79
95
|
name: yard
|
80
96
|
requirement: !ruby/object:Gem::Requirement
|
@@ -140,7 +156,7 @@ dependencies:
|
|
140
156
|
- !ruby/object:Gem::Version
|
141
157
|
version: '0'
|
142
158
|
- !ruby/object:Gem::Dependency
|
143
|
-
name: rspec
|
159
|
+
name: rspec
|
144
160
|
requirement: !ruby/object:Gem::Requirement
|
145
161
|
none: false
|
146
162
|
requirements:
|
@@ -187,11 +203,44 @@ dependencies:
|
|
187
203
|
- - ! '>='
|
188
204
|
- !ruby/object:Gem::Version
|
189
205
|
version: '0'
|
206
|
+
- !ruby/object:Gem::Dependency
|
207
|
+
name: pry
|
208
|
+
requirement: !ruby/object:Gem::Requirement
|
209
|
+
none: false
|
210
|
+
requirements:
|
211
|
+
- - ! '>='
|
212
|
+
- !ruby/object:Gem::Version
|
213
|
+
version: '0'
|
214
|
+
type: :development
|
215
|
+
prerelease: false
|
216
|
+
version_requirements: !ruby/object:Gem::Requirement
|
217
|
+
none: false
|
218
|
+
requirements:
|
219
|
+
- - ! '>='
|
220
|
+
- !ruby/object:Gem::Version
|
221
|
+
version: '0'
|
222
|
+
- !ruby/object:Gem::Dependency
|
223
|
+
name: awesome_print
|
224
|
+
requirement: !ruby/object:Gem::Requirement
|
225
|
+
none: false
|
226
|
+
requirements:
|
227
|
+
- - ! '>='
|
228
|
+
- !ruby/object:Gem::Version
|
229
|
+
version: '0'
|
230
|
+
type: :development
|
231
|
+
prerelease: false
|
232
|
+
version_requirements: !ruby/object:Gem::Requirement
|
233
|
+
none: false
|
234
|
+
requirements:
|
235
|
+
- - ! '>='
|
236
|
+
- !ruby/object:Gem::Version
|
237
|
+
version: '0'
|
190
238
|
description: SemanticCrawler is a ruby library that encapsulates data gathering from
|
191
|
-
different sources. Currently country information from Freebase,
|
192
|
-
(Food and Agriculture Organization of the United Nations), crisis
|
193
|
-
GDACS.org and geo data from LinkedGeoData are supported. Additional
|
194
|
-
module allows to get Factbook and FAO country information from GPS
|
239
|
+
different sources. Currently microdata from websites, country information from Freebase,
|
240
|
+
Factbook and FAO (Food and Agriculture Organization of the United Nations), crisis
|
241
|
+
information from GDACS.org and geo data from LinkedGeoData are supported. Additional
|
242
|
+
the GeoNames module allows to get Factbook and FAO country information from GPS
|
243
|
+
coordinates.
|
195
244
|
email:
|
196
245
|
- alex.oberhauser@sigimera.org
|
197
246
|
executables: []
|
@@ -227,8 +276,11 @@ files:
|
|
227
276
|
- lib/semantic_crawler/linked_geo_data/relevant_node.rb
|
228
277
|
- lib/semantic_crawler/linked_geo_data/relevant_nodes.rb
|
229
278
|
- lib/semantic_crawler/version.rb
|
279
|
+
- lib/semantic_crawler/websites.rb
|
280
|
+
- lib/semantic_crawler/websites/micro_data.rb
|
230
281
|
- lib/tasks/semantic_crawler_tasks.rake
|
231
282
|
- log/.gitkeep
|
283
|
+
- meta_extract.rb
|
232
284
|
- semantic_crawler.gemspec
|
233
285
|
- spec/dbpedia_spec.rb
|
234
286
|
- spec/factbook_spec.rb
|
@@ -239,6 +291,8 @@ files:
|
|
239
291
|
- spec/geo_names_spec.rb
|
240
292
|
- spec/linked_geo_data_spec.rb
|
241
293
|
- spec/spec_helper.rb
|
294
|
+
- spec/websites_spec.rb
|
295
|
+
- start_pry.sh
|
242
296
|
- test/dummy/README.rdoc
|
243
297
|
- test/dummy/Rakefile
|
244
298
|
- test/dummy/app/assets/javascripts/application.js
|
@@ -276,7 +330,6 @@ files:
|
|
276
330
|
- test/semantic_crawler_test.rb
|
277
331
|
- test/test_helper.rb
|
278
332
|
- test/dummy/log/test.log
|
279
|
-
- test/dummy/log/development.log
|
280
333
|
homepage: https://github.com/obale/semantic_crawler
|
281
334
|
licenses:
|
282
335
|
- MIT
|
@@ -290,30 +343,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
290
343
|
- - ! '>='
|
291
344
|
- !ruby/object:Gem::Version
|
292
345
|
version: '0'
|
293
|
-
segments:
|
294
|
-
- 0
|
295
|
-
hash: 1493129874267030689
|
296
346
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
297
347
|
none: false
|
298
348
|
requirements:
|
299
349
|
- - ! '>='
|
300
350
|
- !ruby/object:Gem::Version
|
301
351
|
version: '0'
|
302
|
-
segments:
|
303
|
-
- 0
|
304
|
-
hash: 1493129874267030689
|
305
352
|
requirements: []
|
306
353
|
rubyforge_project:
|
307
|
-
rubygems_version: 1.8.
|
354
|
+
rubygems_version: 1.8.25
|
308
355
|
signing_key:
|
309
356
|
specification_version: 3
|
310
357
|
summary: SemanticCrawler is a ruby library that encapsulates data gathering from different
|
311
|
-
sources.
|
358
|
+
sources, e.g. microdata from websites.
|
312
359
|
test_files:
|
313
360
|
- test/test_helper.rb
|
314
361
|
- test/semantic_crawler_test.rb
|
315
362
|
- test/dummy/log/test.log
|
316
|
-
- test/dummy/log/development.log
|
317
363
|
- test/dummy/public/422.html
|
318
364
|
- test/dummy/public/favicon.ico
|
319
365
|
- test/dummy/public/500.html
|
@@ -351,6 +397,7 @@ test_files:
|
|
351
397
|
- spec/geo_names_spec.rb
|
352
398
|
- spec/spec_helper.rb
|
353
399
|
- spec/freebase_spec.rb
|
400
|
+
- spec/websites_spec.rb
|
354
401
|
- spec/gdacs_spec.rb
|
355
402
|
- spec/fao_austria_spec.rb
|
356
403
|
has_rdoc:
|