RubyGems - tomtaylor-geo-spider - Versions diffs - 0.1.0 - Mend

tomtaylor-geo-spider 0.1.0

Files changed (35) hide show

data/History.txt +3 -0
data/License.txt +20 -0
data/Manifest.txt +34 -0
data/PostInstall.txt +0 -0
data/README.txt +67 -0
data/Rakefile +4 -0
data/config/hoe.rb +73 -0
data/config/requirements.rb +15 -0
data/lib/geo-spider/extractors/base.rb +15 -0
data/lib/geo-spider/extractors/master.rb +23 -0
data/lib/geo-spider/extractors/microformat.rb +21 -0
data/lib/geo-spider/extractors/postcode.rb +40 -0
data/lib/geo-spider/location.rb +18 -0
data/lib/geo-spider/page.rb +83 -0
data/lib/geo-spider/site.rb +50 -0
data/lib/geo-spider/version.rb +9 -0
data/lib/geo-spider.rb +23 -0
data/script/console +10 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/setup.rb +1585 -0
data/spec/assets/pages/multiple_postcodes_and_microformats.html +15 -0
data/spec/assets/pages/page_with_links.html +14 -0
data/spec/assets/pages/separate_microformat_and_postcode.html +13 -0
data/spec/assets/pages/single_microformat.html +13 -0
data/spec/assets/pages/single_postcode.html +13 -0
data/spec/geo-spider/page_spec.rb +125 -0
data/spec/geo-spider/site_spec.rb +8 -0
data/spec/spec.opts +1 -0
data/spec/spec_helper.rb +19 -0
data/tasks/deployment.rake +34 -0
data/tasks/environment.rake +7 -0
data/tasks/rspec.rake +21 -0
data/tasks/website.rake +9 -0
metadata +100 -0

data/History.txt ADDED Viewed

@@ -0,0 +1,3 @@
+== 0.1.0 2008-09-06
+* Initial release

data/License.txt ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2008 Tom Taylor
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/Manifest.txt ADDED Viewed

@@ -0,0 +1,34 @@
+History.txt
+License.txt
+Manifest.txt
+PostInstall.txt
+README.txt
+Rakefile
+config/hoe.rb
+config/requirements.rb
+lib/geo-spider.rb
+lib/geo-spider/extractors/base.rb
+lib/geo-spider/extractors/master.rb
+lib/geo-spider/extractors/microformat.rb
+lib/geo-spider/extractors/postcode.rb
+lib/geo-spider/location.rb
+lib/geo-spider/page.rb
+lib/geo-spider/site.rb
+lib/geo-spider/version.rb
+script/console
+script/destroy
+script/generate
+setup.rb
+spec/assets/pages/multiple_postcodes_and_microformats.html
+spec/assets/pages/page_with_links.html
+spec/assets/pages/separate_microformat_and_postcode.html
+spec/assets/pages/single_microformat.html
+spec/assets/pages/single_postcode.html
+spec/geo-spider/page_spec.rb
+spec/geo-spider/site_spec.rb
+spec/spec.opts
+spec/spec_helper.rb
+tasks/deployment.rake
+tasks/environment.rake
+tasks/rspec.rake
+tasks/website.rake

data/PostInstall.txt ADDED Viewed

File without changes

data/README.txt ADDED Viewed

@@ -0,0 +1,67 @@
+= geo-spider
+* http://geospider.rubyforge.org
+* http://github.com/tomtaylor/geo-spider
+== DESCRIPTION:
+Tool for spidering websites/blogs, extracting geodata from specific pages.
+Starting at a base URL, it will spider every page underneath, returning pages which have a URL that matches a desired pattern.
+The typical use case is spidering an entire blog for posts which contain geodata.
+Different methods for extracting geodata can be used. It currently supports UK postcodes and the abbr design pattern geo microformat <http://microformats.org/wiki/geo>.
+It is currently in use behind the scenes of the Geoblogomatic <http://www.geoblogomatic.com>
+== FEATURES/PROBLEMS:
+* Still very much in development.
+== SYNOPSIS:
+Spider entire sites like so:
+  require 'geo-spider'
+  site = GeoSpider::Site.new("http://www.piecesofhackney.co.uk")
+  site.each_page do |page|
+    puts page.locations.inspect
+  end
+Extract geodata from a specific page like so:
+  require 'geo-spider'
+  page = GeoSpider::Page.new("http://www.nothingtoseehere.net/2008/07/t34_tank_london_1.html")
+  puts page.locations.inspect
+== REQUIREMENTS:
+* hpricot (http://code.whytheluckystiff.net/hpricot/) - for HTML parsing
+* graticule (http://graticule.rubyforge.org/) - for geocoding
+== LICENSE:
+(The MIT License)
+Copyright (c) 2008 Tom Taylor
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/Rakefile ADDED Viewed

@@ -0,0 +1,4 @@
+# Load order matters: config/requirements checks for the build gems and puts
+# lib/ on the load path, which config/hoe relies on to require the version file.
+require 'config/requirements'
+require 'config/hoe' # setup Hoe + all gem configuration
+# Load every rake task definition found under tasks/.
+Dir['tasks/**/*.rake'].each { |rake| load rake }

data/config/hoe.rb ADDED Viewed

@@ -0,0 +1,73 @@
+require 'geo-spider/version'
+AUTHOR = 'Tom Taylor'  # can also be an array of Authors
+EMAIL = "tom@tomtaylor.co.uk"
+DESCRIPTION = "Tool for spidering websites, extracting pages with geodata."
+GEM_NAME = 'geo-spider' # what ppl will type to install your gem
+RUBYFORGE_PROJECT = 'geospider' # The unix name for your project
+HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
+DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
+EXTRA_DEPENDENCIES = [
+ ['hpricot', 'graticule']
+]    # An array of rubygem dependencies [name, version]
+@config_file = "~/.rubyforge/user-config.yml"
+@config = nil
+RUBYFORGE_USERNAME = "tomtaylor"
+# Loads the RubyForge user config on first use and overwrites the contents of
+# RUBYFORGE_USERNAME with its "username" entry.
+# If the config cannot be read or parsed, prints setup instructions and exits.
+def rubyforge_username
+  @config ||= begin
+    YAML.load(File.read(File.expand_path(@config_file)))
+  rescue
+    puts <<-EOS
+ERROR: No rubyforge config file found: #{@config_file}
+Run 'rubyforge setup' to prepare your env for access to Rubyforge
+ - See http://newgem.rubyforge.org/rubyforge.html for more details
+    EOS
+    exit
+  end
+  RUBYFORGE_USERNAME.replace @config["username"]
+end
+REV = nil
+# UNCOMMENT IF REQUIRED:
+# REV = YAML.load(`svn info`)['Revision']
+VERS = GeoSpider::VERSION::STRING + (REV ? ".#{REV}" : "")
+RDOC_OPTS = ['--quiet', '--title', 'geo-spider documentation',
+    "--opname", "index.html",
+    "--line-numbers",
+    "--main", "README",
+    "--inline-source"]
+class Hoe
+  def extra_deps
+    @extra_deps.reject! { |x| Array(x).first == 'hoe' }
+    @extra_deps
+  end
+end
+# Generate all the Rake tasks
+# Run 'rake -T' to see list of generated tasks (from gem root directory)
+# The configured Hoe instance is kept in $hoe so the statements that follow
+# this block can keep adjusting it.
+$hoe = Hoe.new(GEM_NAME, VERS) do |p|
+  p.developer(AUTHOR, EMAIL)
+  # The same text is used for both the long description and the summary.
+  p.description = DESCRIPTION
+  p.summary = DESCRIPTION
+  p.url = HOMEPATH
+  p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
+  # NOTE(review): the manifest only lists files under spec/, so this glob
+  # looks like it matches nothing -- confirm whether that is intended.
+  p.test_globs = ["test/**/test_*.rb"]
+  p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store']  #An array of file patterns to delete on clean.
+  p.remote_rdoc_dir = ''
+  # == Optional
+  # Changelog text: the first two paragraphs of History.txt.
+  p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
+  # NOTE(review): extra_deps is disabled, so EXTRA_DEPENDENCIES is not passed
+  # to Hoe here -- confirm that leaving the runtime gems undeclared is intended.
+  #p.extra_deps = EXTRA_DEPENDENCIES
+    #p.spec_extras = {}    # A hash of extra values to set in the gemspec.
+  end
+# NOTE(review): this joins with a literal backslash-n pair, unlike p.changes
+# which joins with real newlines -- confirm that is what CHANGES consumers expect.
+CHANGES = $hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
+# RubyForge path: the project name alone when it equals the gem name,
+# otherwise "<project>/<gem>".
+PATH    = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
+# $hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
+$hoe.rsync_args = '-av --delete --ignore-errors'
+# File.read opens and closes the file itself, so no handle is left open.
+# An unreadable or missing PostInstall.txt falls back to an empty message.
+$hoe.spec.post_install_message = File.read(File.dirname(__FILE__) + "/../PostInstall.txt") rescue ""

data/config/requirements.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'fileutils'
+include FileUtils
+require 'rubygems'
+%w[rake hoe newgem rubigen].each do |req_gem|
+  begin
+    require req_gem
+  rescue LoadError
+    puts "This Rakefile requires the '#{req_gem}' RubyGem."
+    puts "Installation: gem install #{req_gem} -y"
+    exit
+  end
+end
+$:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))

data/lib/geo-spider/extractors/base.rb ADDED Viewed

@@ -0,0 +1,15 @@
+module GeoSpider
+  module Extractors
+    # Common superclass for the extractors: it only stores the element that
+    # a subclass will read its geodata from.
+    class Base
+      # element - the parsed element to extract from. The subclasses in this
+      #           gem call search and inner_text on it.
+      def initialize(element)
+        @element = element
+      end
+    end
+  end
+end

data/lib/geo-spider/extractors/master.rb ADDED Viewed

@@ -0,0 +1,23 @@
+require 'geo-spider/extractors/microformat'
+require 'geo-spider/extractors/postcode'
+module GeoSpider
+  module Extractors
+    class Master < GeoSpider::Extractors::Base
+      # TODO: Handle duplicates from different data sources
+      def locations
+        microformat_locations = Extractors::Microformat.new(@element).locations
+        postcode_locations = Extractors::Postcode.new(@element).locations
+        (microformat_locations + postcode_locations).flatten
+      end
+    end
+  end
+end

data/lib/geo-spider/extractors/microformat.rb ADDED Viewed

@@ -0,0 +1,21 @@
+require 'geo-spider/extractors/base'
+module GeoSpider
+  module Extractors
+    class Microformat < GeoSpider::Extractors::Base
+      def locations
+        @element.search("abbr[@class='geo'][@title]").map do |geo|
+          latitude, longitude = geo.attributes["title"].split(";")
+          text = geo.inner_text
+          Location.new(:latitude => latitude.to_f, :longitude => longitude.to_f, :title => text)
+        end
+      end
+    end
+  end
+end

data/lib/geo-spider/extractors/postcode.rb ADDED Viewed

@@ -0,0 +1,40 @@
+require 'geo-spider/extractors/base'
+require 'graticule'
+module GeoSpider
+  module Extractors
+    class Postcode < Base
+      # Full BS 7666 postcode format. Source: http://en.wikipedia.org/wiki/UK_postcodes
+      REGEXP = /(GIR 0AA|[A-PR-UWYZ]([0-9]{1,2}|([A-HK-Y][0-9]|[A-HK-Y][0-9]([0-9]|[ABEHMNPRV-Y]))|[0-9][A-HJKS-UW])(\s*)[0-9][ABD-HJLNP-UW-Z]{2})/i
+      def locations
+        results = @element.inner_text.scan(REGEXP)
+        results = results.map(&:first)
+        results.map do |result|
+          latitude, longitude = geocoder.location(result)
+          Location.new(:latitude => latitude, :longitude => longitude, :title => result)
+        end
+      end
+      # You need to set a valid Yahoo API key before the UK postcode geocoding will work. Yahoo have vastly better UK postcode accuracy than the other large mapping providers, apart from perhaps Multimap.
+      def self.api_key=(api_key)
+        @@api_key = api_key
+      end
+      private
+      def geocoder
+        raise "No Yahoo API key set" unless @@api_key
+        Graticule.service(:yahoo).new @@api_key
+      end
+    end
+  end
+end

data/lib/geo-spider/location.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module GeoSpider
+  class Location
+    attr_reader :longitude, :latitude, :title
+    def initialize(params = {})
+      raise "No longitude provided" unless params[:longitude]
+      raise "No latitude provided" unless params[:latitude]
+      @latitude = params[:latitude]
+      @longitude = params[:longitude]
+      @title = params[:title]
+    end
+  end
+end

data/lib/geo-spider/page.rb ADDED Viewed

@@ -0,0 +1,83 @@
+require 'geo-spider/location'
+require 'geo-spider/extractors/master'
+module GeoSpider
+  class Page
+    attr_reader :url
+    DEFAULT_CONTENT_CSS_SELECTOR = "body" # Find locations within the entire body by default
+    DEFAULT_TITLE_CSS_SELECTOR = "title" # Use the title in the head by deault
+    # Create a new page based on the URL.
+    def initialize(url, options = {})
+      @url = url
+      @site = options[:site]
+      @content_css_selector = options[:content_css_selector] || DEFAULT_CONTENT_CSS_SELECTOR
+      @title_css_selector = options[:title_css_selector] || DEFAULT_TITLE_CSS_SELECTOR
+      hpricot_doc
+    end
+    def title
+      hpricot_doc.at(@title_css_selector).inner_text
+    end
+    # Returns an array of Location objects based on the locations found in the page.
+    def locations
+      body_element = hpricot_doc.at(@content_css_selector)
+      master_extractor = Extractors::Master.new(body_element)
+      master_extractor.locations
+    end
+    # Returns a unique array of URLs present in the page as strings, normalized to remove anchors.
+    def links
+      hpricot_doc.search("a[@href]").map do |a|
+        normalize_url(a.attributes["href"])
+      end.uniq.reject { |b| rejected_url?(b) }
+    end
+    # Returns a unique array of internal URLs present in the page as string, normalized to remove anchors. Needs the page to know what site it is part of, or it cannot decide what is an internal link.
+    def internal_links
+      raise("Cannot discover internal links without knowing what site this page is part of.") if @site.nil?
+      links.select { |l| internal_url?(l) }
+    end
+    private
+    def hpricot_doc
+      @hpricot_doc ||= Hpricot(raw_http)
+    end
+    def raw_http
+      open(self.url, 'User-Agent' => GeoSpider::user_agent)
+    end
+    def internal_url?(url_to_test)
+      # Does it begin with the URL of the site and what's the extension?
+      url_to_test[0, @site.url.to_s.length] == @site.url.to_s
+    end
+    def rejected_url?(url_to_test)
+      url_to_test =~ /(mp3|m4a|mov|jpg|png|gif|zip|pdf)$/i
+    end
+    def normalize_url(link_url)
+      begin
+        link_url = URI.parse(link_url)
+        link_url.merge(@url) unless link_url.absolute?
+        link_url.fragment = nil
+        link_url.to_s
+      rescue URI::InvalidURIError
+        ""
+      end
+    end
+  end
+end

data/lib/geo-spider/site.rb ADDED Viewed

@@ -0,0 +1,50 @@
+module GeoSpider
+  class Site
+    attr_reader :url
+    DEFAULT_REGEXP = /.+/ # By default match every URL
+    def initialize(url)
+      @url = URI.parse(url)
+    end
+    def each_page(options = {}, &block)
+      regexp = options.delete(:regexp) || DEFAULT_REGEXP
+      options = options.merge( { :site => self } )
+      queue = [self.url.to_s]
+      seen = []
+      until queue.empty? do
+        url = queue.shift
+        begin
+          page = Page.new(url, options)
+          if url =~ regexp
+            yield page
+          end
+          seen << url
+          next_links = (page.internal_links - seen - queue) # only add internal links that we've not seen or already have queued.
+          queue.concat(next_links)
+        rescue # need to decide what exactly to rescue from, rather than just everything.
+          next
+        end
+      end
+    end
+    def pages(options = {})
+      pages = []
+      self.each_page(options) do |page|
+        pages << page
+      end
+      pages
+    end
+  end
+end

data/lib/geo-spider/version.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module GeoSpider
+  module VERSION #:nodoc:
+    MAJOR = 0
+    MINOR = 1
+    TINY  = 0
+    STRING = [MAJOR, MINOR, TINY].join('.')
+  end
+end

data/lib/geo-spider.rb ADDED Viewed

@@ -0,0 +1,23 @@
+$:.unshift(File.dirname(__FILE__)) unless
+  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
+require 'rubygems'
+require 'hpricot'
+require 'open-uri'
+require 'geo-spider/page'
+require 'geo-spider/site'
+module GeoSpider
+  DEFAULT_USER_AGENT = 'geo-spider (http://github.com/tomtaylor/geo-spider)'
+  def self.user_agent
+    @user_agent || DEFAULT_USER_AGENT
+  end
+  def self.user_agent=(user_agent)
+    @user_agent = user_agent
+  end
+end

data/script/console ADDED Viewed

@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+# File: script/console
+irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'
+libs =  " -r irb/completion"
+# Perhaps use a console_lib to store any extra methods I may want available in the cosole
+# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
+libs <<  " -r #{File.dirname(__FILE__) + '/../lib/geo-spider.rb'}"
+puts "Loading geo-spider gem"
+exec "#{irb} #{libs} --simple-prompt"

data/script/destroy ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+# Runs RubiGen's destroy script with the arguments given on the command line.
+# APP_ROOT is the absolute path of the directory above script/.
+APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+# Try rubigen directly; if that fails, load rubygems first and retry.
+begin
+  require 'rubigen'
+rescue LoadError
+  require 'rubygems'
+  require 'rubigen'
+end
+require 'rubigen/scripts/destroy'
+# Drop a leading --help / -h before handing ARGV to the script.
+ARGV.shift if ['--help', '-h'].include?(ARGV[0])
+# Presumably selects the sources RubiGen searches for generators -- confirm
+# against the RubiGen documentation.
+RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
+RubiGen::Scripts::Destroy.new.run(ARGV)

data/script/generate ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+# Runs RubiGen's generate script with the arguments given on the command line.
+# APP_ROOT is the absolute path of the directory above script/.
+APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+# Try rubigen directly; if that fails, load rubygems first and retry.
+begin
+  require 'rubigen'
+rescue LoadError
+  require 'rubygems'
+  require 'rubigen'
+end
+require 'rubigen/scripts/generate'
+# Drop a leading --help / -h before handing ARGV to the script.
+ARGV.shift if ['--help', '-h'].include?(ARGV[0])
+# Presumably selects the sources RubiGen searches for generators -- confirm
+# against the RubiGen documentation.
+RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
+RubiGen::Scripts::Generate.new.run(ARGV)