RubyGems - semantic-crawler - Versions diffs - 0.0.2 → 0.0.3 - Mend

semantic-crawler 0.0.2 → 0.0.3

Files changed (20) hide show

data/README.rdoc +17 -6
data/Rakefile +2 -4
data/lib/semantic_crawler.rb +25 -2
data/lib/semantic_crawler/factbook.rb +11 -0
data/lib/semantic_crawler/factbook/country.rb +15 -5
data/lib/semantic_crawler/freebase.rb +4 -0
data/lib/semantic_crawler/freebase/country.rb +5 -0
data/lib/semantic_crawler/freebase/crawler.rb +1 -0
data/lib/semantic_crawler/freebase/entity.rb +1 -0
data/lib/semantic_crawler/gdacs.rb +6 -0
data/lib/semantic_crawler/gdacs/feed.rb +100 -0
data/lib/semantic_crawler/gdacs/feed_item.rb +172 -0
data/lib/semantic_crawler/gdacs/resource.rb +66 -0
data/lib/semantic_crawler/linked_geo_data.rb +5 -0
data/lib/semantic_crawler/linked_geo_data/relevant_node.rb +13 -0
data/lib/semantic_crawler/version.rb +3 -1
data/test/dummy/log/test.log +4802 -0
data/test/factbook_test.rb +1 -1
data/test/gdacs_test.rb +80 -0
metadata +37 -27

data/README.rdoc CHANGED Viewed

@@ -1,5 +1,4 @@
-[ATTENTION]  This library is under heavy development and currently not working properly. Please be patient
-             for a usable version.
+[ATTENTION]  This library is under heavy development. Please be patient for a usable version.
 = SemanticCrawler
@@ -12,27 +11,35 @@ to bypass complex NLP (natural language processing).
 == Supported Sources
 * CIA Factbook RDF Dump, see http://www4.wiwiss.fu-berlin.de/factbook/directory/countries
+* [Started] LinkedGeoData - LGD (see http://linkedgeodata.org)
+* [Started] GDACS (see http://gdacs.org)
+* [Started] Freebase (see http://freebase.com)
 === TODO
-* LinkedGeoData - LGD(see http://linkedgeodata.org)
-* Freebase
 * Geonames
 * DBPedia
 * Different Government Sources
 == Installation
+    $ gem install semantic-crawler
+Or from source:
     $ git clone git://github.com/obale/semantic_crawler.git
     $ cd semantic_crawler
     $ bundle install
     $ rake build
     $ rake install pkg/semantic-crawler-*.gem
-If the library is stable enough it will be published at rubygems.org. That simplifies the installation.
 You can add this library also as dependency in your Gemfile:
+    gem "semantic-crawler"
+Or from source:
     gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git"                   # for the master branch or
     gem "semantic-crawler", :git => "git://github.com/obale/semantic_crawler.git", :tags => "xyz"   # for the xyz tag
@@ -56,6 +63,10 @@ see CHANGELOG.rdoc
 * Ruby 1.8.7-p358 and Rails 3.2.2
 * Ruby 1.9.3-p125 and Rails 3.2.2
+=== Development Branch Health
+{<img src="https://secure.travis-ci.org/obale/semantic_crawler.png?branch=master"/>}[http://travis-ci.org/#!/obale/semantic_crawler]
 == License

data/Rakefile CHANGED Viewed

@@ -16,13 +16,12 @@ RDoc::Task.new(:rdoc) do |rdoc|
   rdoc.rdoc_dir = 'rdoc'
   rdoc.title    = 'SemanticCrawler'
   rdoc.options << '--line-numbers'
+  rdoc.options << '--main=README.rdoc'
   rdoc.rdoc_files.include('README.rdoc')
+  rdoc.rdoc_files.include('CHANGELOG.rdoc')
   rdoc.rdoc_files.include('lib/**/*.rb')
 end
 Bundler::GemHelper.install_tasks
 require 'rake/testtask'
@@ -34,5 +33,4 @@ Rake::TestTask.new(:test) do |t|
   t.verbose = false
 end
 task :default => :test

data/lib/semantic_crawler.rb CHANGED Viewed

@@ -1,13 +1,36 @@
+# encoding: UTF-8
 require "httparty"
+# The top level module contains the different data sources
+# as sub-modules. Currently there are the following modules
+# available:
+#
+# * Freebase
+# * Factbook
+# * LinkedGeoData
+# * Gdacs
+#
+# The existing modules are extended stepwise and additional
+# sources are added in the future.
 module SemanticCrawler
 end
-# Freebase.com
+# Freebase.com - module: Freebase
 require "semantic_crawler/freebase/crawler"
 require "semantic_crawler/freebase/entity"
 require "semantic_crawler/freebase/country"
-# CIA Factbook RDF Dump
+# CIA Factbook RDF Dump - module: Factbook
+require "semantic_crawler/factbook"
 require "semantic_crawler/factbook/country"
+# LinkedGeoData (http://linkedgeodata.org) - module: LinkedGeoData
+require "semantic_crawler/linked_geo_data"
+require "semantic_crawler/linked_geo_data/relevant_node"
+# GDACS (http://gdacs.org) - module: Gdacs
+require "semantic_crawler/gdacs"
+require "semantic_crawler/gdacs/feed.rb"
+require "semantic_crawler/gdacs/feed_item.rb"
+require "semantic_crawler/gdacs/resource.rb"

data/lib/semantic_crawler/factbook.rb ADDED Viewed

@@ -0,0 +1,11 @@
+# The RDF Dump of the CIA Factbook contains country relevant information. The
+# information may be outdated, but for general-purpose information that
+# never or only infrequently changes, this source is perfect.
+#
+# This module encapsulates the access to the underlying RDF files and wraps
+# the most important properties. Properties in the factbook namespace that are
+# not wrapped can be accessed via the following method:
+#
+# * link:SemanticCrawler::Factbook::Country::get_factbook_property
+module SemanticCrawler::Factbook
+end

data/lib/semantic_crawler/factbook/country.rb CHANGED Viewed

@@ -1,18 +1,28 @@
+# encoding: UTF-8
 require 'nokogiri'
 require 'open-uri'
 module SemanticCrawler
     module Factbook
+        # Extracted from the RDF Dump of the CIA Factbook. Contains all
+        # relevant, but maybe deprecated information about countries.
         class Country
+            # The URI prefix used for each country
             @@URI_PREFIX = "http://www4.wiwiss.fu-berlin.de/factbook/data/"
+            # Predefined RDFS/OWL namespaces used for RDF file parsing.
             @@NAMESPACES = {
                 "factbook" => "http://www4.wiwiss.fu-berlin.de/factbook/ns#",
                 "rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
                 "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
             }
+            # Country name given as input during the object creation.
             attr_reader :country_name
+            # The complete URL of the country. May also be wrong
+            # if the country_name is not valid.
             attr_reader :url
             # Get Country Information from the CIA Factbook. see
@@ -23,15 +33,14 @@ module SemanticCrawler
             #   >> puts austria.background
             #
             # Arguments:
-            #   name: (String)
+            #   new_country_name: (String)
             def initialize(new_country_name)
-                new_country_name.gsub!(" ", "_")
-                @country_name = new_country_name.downcase
-                @url = @@URI_PREFIX + @country_name
+                @country_name = new_country_name
+                @url = @@URI_PREFIX + @country_name.downcase.gsub(" ", "_").gsub("usa", "united_states")
                 begin
                     fetch_rdf
                 rescue => e
-                    puts "Not able to get country information, through exception: " + e
+                    puts "Not able to get country information, through exception: " + e.message
                 end
             end
@@ -116,6 +125,7 @@ module SemanticCrawler
             end
             private
+            # Retrieves the RDF stream
             def fetch_rdf
                 @doc = Nokogiri::XML(open(@url))
             end

data/lib/semantic_crawler/freebase.rb ADDED Viewed

@@ -0,0 +1,4 @@
+# The module Freebase retrieves general information
+# about different types.
+module SemanticCrawler::Freebase
+end

data/lib/semantic_crawler/freebase/country.rb CHANGED Viewed

@@ -1,9 +1,14 @@
+# [XXX] The current implementation outputs only an unreadable JSON object.
 class SemanticCrawler::Freebase::Country < SemanticCrawler::Freebase::Entity
+    # The Freebase object that should be retrieved
     attr_accessor :input_name
+    # The URL that points to the JSON object.
     attr_accessor :json_link
+    # The URL prefix of a Freebase experimental JSON entity.
     @@uri_prefix = "http://www.freebase.com/experimental/topic/standard/en/"
+    # Creates a new Freebase object (JSON)
     def initialize
         @input_name = nil
         @html_link = nil

data/lib/semantic_crawler/freebase/crawler.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 module SemanticCrawler
     module Freebase
+        # [XXX] This class should be deleted. The country object could be called directly.
         class Crawler
             # Returns a country object that contains all relevant
             # information that could be extracted from freebase.com

data/lib/semantic_crawler/freebase/entity.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# [XXX] Delete this class. Not necessary.
 class SemanticCrawler::Freebase::Entity
     include HTTParty
 end

data/lib/semantic_crawler/gdacs.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# encoding: UTF-8
+#
+# The Gdacs module is responsible for the parsing of the
+# current crisis RSS feeds from http://gdacs.org
+module SemanticCrawler::Gdacs
+end

data/lib/semantic_crawler/gdacs/feed.rb ADDED Viewed

@@ -0,0 +1,100 @@
+# encoding: UTF-8
+module SemanticCrawler
+    module Gdacs
+        # The GDACS.org RSS feed contains the latest crisis information about
+        # the following crisis types:
+        #
+        # * Floods
+        # * Earthquakes
+        # * Tropical Cyclones
+        # * Volcanoes
+        #
+        # The feed is fetched once, when the object is created. If fetching
+        # fails, the error is printed, the query methods return nil and
+        # #items returns an empty array.
+        class Feed
+            # XML namespaces used for the parsing process
+            @@NAMESPACES = {
+                "atom" => "http://www.w3.org/2005/Atom"
+            }
+            # The gdacs.org RSS feed URL. (default:
+            # http://new.gdacs.org/xml/rss.xml)
+            attr_reader :url
+            # Initializes the gdacs.org feed URL and fetches the feed. If not
+            # specified the default URL (http://new.gdacs.org/xml/rss.xml) is
+            # used. Normally the feed URL should not be changed.
+            #
+            # Errors raised while fetching are rescued and printed, so the
+            # constructor itself does not raise them.
+            #
+            # Arguments:
+            #   new_url: (String)
+            def initialize(new_url = "http://new.gdacs.org/xml/rss.xml")
+                @url = new_url
+                @root_node = nil
+                begin
+                    fetch_feed
+                rescue => e
+                    puts "Not able to get the GDACS feed, through exception: " + e.message
+                end
+            end
+            # Get rss/channel/title
+            def title
+                query_root_node("title/text()")
+            end
+            # Get rss/channel/link
+            def link
+                query_root_node("link/text()")
+            end
+            # Get rss/channel/description
+            def description
+                query_root_node("description/text()")
+            end
+            # Get rss/channel/pubDate
+            def pubDate
+                query_root_node("pubDate/text()")
+            end
+            # Get rss/channel/webMaster
+            def webMaster
+                query_root_node("webMaster/text()")
+            end
+            # Get rss/channel/managingEditor
+            def managingEditor
+                query_root_node("managingEditor/text()")
+            end
+            # Get rss/channel/atom:link
+            def atom_link
+                query_root_node("atom:link/@href", @@NAMESPACES)
+            end
+            # Get rss/channel/item*
+            #
+            # Returns an array of SemanticCrawler::Gdacs::FeedItem objects, one
+            # per item node. The array is empty if the feed could not be
+            # fetched.
+            def items
+                nodeset = query_root_node("item")
+                # query_root_node returns nil when no feed was loaded; fall
+                # back to an empty list instead of calling a method on nil.
+                @items = (nodeset || []).map do |item|
+                    SemanticCrawler::Gdacs::FeedItem.new(item)
+                end
+            end
+            # Runs the XPath query relative to rss/channel.
+            #
+            # Arguments:
+            #   xpath_query: (String)
+            #   namespaces: (Hash) prefix => namespace URI, default: {}
+            #
+            # Returns the result of the xpath call, or nil if no feed was
+            # loaded.
+            def query_root_node(xpath_query, namespaces = {})
+                return nil if @root_node.nil?
+                @root_node.xpath(xpath_query, namespaces)
+            end
+            # Returns the rss/channel node(s) as string. The string is empty
+            # if no feed was loaded.
+            def xml_document
+                @root_node.to_s
+            end
+            private
+                # Retrieves the RSS feed and remembers the rss/channel node(s).
+                #
+                # NOTE(review): Nokogiri and open-uri are not required at the
+                # top of this file; presumably another file of the library
+                # loads them first - confirm. On Ruby >= 3.0 open(url) no
+                # longer fetches URLs and URI.open has to be used instead.
+                def fetch_feed
+                    @doc = Nokogiri::XML(open(@url))
+                    @root_node = @doc.xpath("/rss/channel")
+                end
+        end
+    end
+end

data/lib/semantic_crawler/gdacs/feed_item.rb ADDED Viewed

@@ -0,0 +1,172 @@
+# encoding: UTF-8
+module SemanticCrawler
+    module Gdacs
+        # One crisis entity with related resources. Could be one of the
+        # following crisis types:
+        #
+        # * Floods
+        # * Earthquakes
+        # * Tropical Cyclones
+        # * Volcanoes
+        #
+        # Every accessor runs an XPath query relative to the node given to the
+        # constructor and returns the raw query result. All accessors return
+        # nil if that node is nil; #resources returns an empty array instead.
+        class FeedItem
+            # XML namespaces used for the parsing process
+            @@NAMESPACES = {
+                "gdacs" => "http://www.gdacs.org",
+                "asgard" => "http://asgard.jrc.it",
+                "geo" => "http://www.w3.org/2003/01/geo/wgs84_pos#",
+                "dc" => "http://purl.org/dc/elements/1.1/"
+            }
+            # Wraps one item node of the feed.
+            #
+            # Arguments:
+            #   new_root_node: the node all queries are evaluated against
+            #                  (it has to respond to xpath, or be nil)
+            def initialize(new_root_node)
+                @root_node = new_root_node
+            end
+            # Returns the crisis title
+            def title
+                query_root_node("title/text()")
+            end
+            # Returns the crisis description
+            def description
+                query_root_node("description/text()")
+            end
+            # Returns the enclosure URL
+            def enclosure_url
+                query_root_node("enclosure/@url")
+            end
+            # Returns the enclosure type, e.g. image/png
+            def enclosure_type
+                query_root_node("enclosure/@type")
+            end
+            # Returns the enclosure length, e.g. 1
+            def enclosure_length
+                query_root_node("enclosure/@length")
+            end
+            # Returns the crisis gdacs link
+            def link
+                query_root_node("link/text()")
+            end
+            # Returns the publication date
+            def pubDate
+                query_root_node("pubDate/text()")
+            end
+            # Returns the crisis start date
+            def fromdate
+                query_root_node("gdacs:fromdate/text()", @@NAMESPACES)
+            end
+            # Returns the crisis end date
+            def todate
+                query_root_node("gdacs:todate/text()", @@NAMESPACES)
+            end
+            # Returns the crisis subject abbreviation
+            def subject
+                query_root_node("dc:subject/text()", @@NAMESPACES)
+            end
+            # Returns an unique crisis identifier (could be non permanent)
+            def guid
+                query_root_node("guid/text()")
+            end
+            # Returns the latitude GPS coordinate where the crisis has occurred
+            def latitude
+                query_root_node("geo:Point/geo:lat/text()", @@NAMESPACES)
+            end
+            # Returns the longitude GPS coordinate where the crisis has occurred
+            def longitude
+                query_root_node("geo:Point/geo:long/text()", @@NAMESPACES)
+            end
+            # Returns the version
+            def version
+                query_root_node("gdacs:version/text()", @@NAMESPACES)
+            end
+            # Returns the event type abbreviation, e.g. VO (for volcanic
+            # eruption), EQ (for earthquake), FL (for flood), TC (for tropical
+            # cyclone)
+            def eventtype
+                query_root_node("gdacs:eventtype/text()", @@NAMESPACES)
+            end
+            # Returns the alert level, could be GREEN, ORANGE or RED
+            def alertlevel
+                query_root_node("gdacs:alertlevel/text()", @@NAMESPACES)
+            end
+            # Returns the event name if available
+            def eventname
+                query_root_node("gdacs:eventname/text()", @@NAMESPACES)
+            end
+            # Returns the event id
+            def eventid
+                query_root_node("gdacs:eventid/text()", @@NAMESPACES)
+            end
+            # Returns the episode id
+            def episodeid
+                query_root_node("gdacs:episodeid/text()", @@NAMESPACES)
+            end
+            # Returns the severity as human readable string
+            def severity
+                query_root_node("gdacs:severity/text()", @@NAMESPACES)
+            end
+            # Returns the population as human readable string
+            def population
+                query_root_node("gdacs:population/text()", @@NAMESPACES)
+            end
+            # Returns the vulnerability as human readable string
+            def vulnerability
+                query_root_node("gdacs:vulnerability/text()", @@NAMESPACES)
+            end
+            # Returns the country iso3 code if available
+            def iso3
+                query_root_node("gdacs:iso3/text()", @@NAMESPACES)
+            end
+            # Returns the country name(s)
+            def country
+                query_root_node("gdacs:country/text()", @@NAMESPACES)
+            end
+            # Returns the content of the gdacs:glide element (presumably the
+            # GLIDE disaster identifier number - TODO confirm against the
+            # GDACS feed documentation)
+            def glide
+                query_root_node("gdacs:glide/text()", @@NAMESPACES)
+            end
+            # Returns an array of SemanticCrawler::Gdacs::Resource objects,
+            # one per gdacs:resources/gdacs:resource node. The array is empty
+            # if the item has no root node.
+            def resources
+                nodeset = query_root_node("gdacs:resources/gdacs:resource", @@NAMESPACES)
+                # query_root_node returns nil for a nil root node; fall back
+                # to an empty list instead of calling a method on nil.
+                @items = (nodeset || []).map do |item|
+                    SemanticCrawler::Gdacs::Resource.new(item)
+                end
+            end
+            # Runs the XPath query relative to the item node.
+            #
+            # Arguments:
+            #   xpath_query: (String)
+            #   namespaces: (Hash) prefix => namespace URI, default: {}
+            #
+            # Returns the result of the xpath call, or nil if the item has no
+            # root node.
+            def query_root_node(xpath_query, namespaces = {})
+                return nil if @root_node.nil?
+                @root_node.xpath(xpath_query, namespaces)
+            end
+        end
+    end
+end