tomtaylor-geo-spider 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,3 @@
1
+ == 0.1.0 2008-09-06
2
+
3
+ * Initial release
data/License.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 Tom Taylor
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,34 @@
1
+ History.txt
2
+ License.txt
3
+ Manifest.txt
4
+ PostInstall.txt
5
+ README.txt
6
+ Rakefile
7
+ config/hoe.rb
8
+ config/requirements.rb
9
+ lib/geo-spider.rb
10
+ lib/geo-spider/extractors/base.rb
11
+ lib/geo-spider/extractors/master.rb
12
+ lib/geo-spider/extractors/microformat.rb
13
+ lib/geo-spider/extractors/postcode.rb
14
+ lib/geo-spider/location.rb
15
+ lib/geo-spider/page.rb
16
+ lib/geo-spider/site.rb
17
+ lib/geo-spider/version.rb
18
+ script/console
19
+ script/destroy
20
+ script/generate
21
+ setup.rb
22
+ spec/assets/pages/multiple_postcodes_and_microformats.html
23
+ spec/assets/pages/page_with_links.html
24
+ spec/assets/pages/separate_microformat_and_postcode.html
25
+ spec/assets/pages/single_microformat.html
26
+ spec/assets/pages/single_postcode.html
27
+ spec/geo-spider/page_spec.rb
28
+ spec/geo-spider/site_spec.rb
29
+ spec/spec.opts
30
+ spec/spec_helper.rb
31
+ tasks/deployment.rake
32
+ tasks/environment.rake
33
+ tasks/rspec.rake
34
+ tasks/website.rake
data/PostInstall.txt ADDED
File without changes
data/README.txt ADDED
@@ -0,0 +1,67 @@
1
+ = geo-spider
2
+
3
+ * http://geospider.rubyforge.org
4
+ * http://github.com/tomtaylor/geo-spider
5
+
6
+ == DESCRIPTION:
7
+
8
+ Tool for spidering websites/blogs, extracting geodata from specific pages.
9
+
10
+ Starting at a base URL, it will spider every page underneath, returning pages which have a URL that matches a desired pattern.
11
+
12
+ The typical use case is spidering an entire blog for posts which contain geodata.
13
+
14
+ Different methods for extracting geodata can be used. It currently supports UK postcodes and the abbr design pattern geo microformat <http://microformats.org/wiki/geo>.
15
+
16
+ It is currently in use behind the scenes of the Geoblogomatic <http://www.geoblogomatic.com>
17
+
18
+ == FEATURES/PROBLEMS:
19
+
20
+ * Still very much in development.
21
+
22
+ == SYNOPSIS:
23
+
24
+ Spider entire sites like so:
25
+
26
+ require 'geo-spider'
27
+ site = GeoSpider::Site.new("http://www.piecesofhackney.co.uk")
28
+
29
+ site.each_page do |page|
30
+ puts page.locations.inspect
31
+ end
32
+
33
+ Extract geodata from specific page like so:
34
+
35
+ require 'geo-spider'
36
+ page = GeoSpider::Page.new("http://www.nothingtoseehere.net/2008/07/t34_tank_london_1.html")
37
+ puts page.locations.inspect
38
+
39
+ == REQUIREMENTS:
40
+
41
+ * hpricot (http://code.whytheluckystiff.net/hpricot/) - for HTML parsing
42
+ * graticule (http://graticule.rubyforge.org/) - for geocoding
43
+
44
+ == LICENSE:
45
+
46
+ (The MIT License)
47
+
48
+ Copyright (c) 2008 Tom Taylor
49
+
50
+ Permission is hereby granted, free of charge, to any person obtaining
51
+ a copy of this software and associated documentation files (the
52
+ 'Software'), to deal in the Software without restriction, including
53
+ without limitation the rights to use, copy, modify, merge, publish,
54
+ distribute, sublicense, and/or sell copies of the Software, and to
55
+ permit persons to whom the Software is furnished to do so, subject to
56
+ the following conditions:
57
+
58
+ The above copyright notice and this permission notice shall be
59
+ included in all copies or substantial portions of the Software.
60
+
61
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
62
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
63
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
64
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
65
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
66
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
67
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ require 'config/requirements'
2
+ require 'config/hoe' # setup Hoe + all gem configuration
3
+
4
+ Dir['tasks/**/*.rake'].each { |rake| load rake }
data/config/hoe.rb ADDED
@@ -0,0 +1,73 @@
1
+ require 'geo-spider/version'
2
+
3
+ AUTHOR = 'Tom Taylor' # can also be an array of Authors
4
+ EMAIL = "tom@tomtaylor.co.uk"
5
+ DESCRIPTION = "Tool for spidering websites, extracting pages with geodata."
6
+ GEM_NAME = 'geo-spider' # what ppl will type to install your gem
7
+ RUBYFORGE_PROJECT = 'geospider' # The unix name for your project
8
+ HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
9
+ DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
10
# Runtime gem dependencies, one [name, version requirement] pair per gem.
# The previous value was the single pair ['hpricot', 'graticule'], which under
# that convention reads "graticule" as the version requirement of hpricot.
EXTRA_DEPENDENCIES = [
  ['hpricot', '>= 0'],
  ['graticule', '>= 0']
] # An array of rubygem dependencies [name, version]
13
+
14
+ @config_file = "~/.rubyforge/user-config.yml"
15
+ @config = nil
16
+ RUBYFORGE_USERNAME = "tomtaylor"
17
+ def rubyforge_username
18
+ unless @config
19
+ begin
20
+ @config = YAML.load(File.read(File.expand_path(@config_file)))
21
+ rescue
22
+ puts <<-EOS
23
+ ERROR: No rubyforge config file found: #{@config_file}
24
+ Run 'rubyforge setup' to prepare your env for access to Rubyforge
25
+ - See http://newgem.rubyforge.org/rubyforge.html for more details
26
+ EOS
27
+ exit
28
+ end
29
+ end
30
+ RUBYFORGE_USERNAME.replace @config["username"]
31
+ end
32
+
33
+
34
+ REV = nil
35
+ # UNCOMMENT IF REQUIRED:
36
+ # REV = YAML.load(`svn info`)['Revision']
37
+ VERS = GeoSpider::VERSION::STRING + (REV ? ".#{REV}" : "")
38
+ RDOC_OPTS = ['--quiet', '--title', 'geo-spider documentation',
39
+ "--opname", "index.html",
40
+ "--line-numbers",
41
+ "--main", "README",
42
+ "--inline-source"]
43
+
44
+ class Hoe
45
+ def extra_deps
46
+ @extra_deps.reject! { |x| Array(x).first == 'hoe' }
47
+ @extra_deps
48
+ end
49
+ end
50
+
51
+ # Generate all the Rake tasks
52
+ # Run 'rake -T' to see list of generated tasks (from gem root directory)
53
+ $hoe = Hoe.new(GEM_NAME, VERS) do |p|
54
+ p.developer(AUTHOR, EMAIL)
55
+ p.description = DESCRIPTION
56
+ p.summary = DESCRIPTION
57
+ p.url = HOMEPATH
58
+ p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
59
+ p.test_globs = ["test/**/test_*.rb"]
60
+ p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store'] #An array of file patterns to delete on clean.
61
+ p.remote_rdoc_dir = ''
62
+ # == Optional
63
+ p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
64
+ #p.extra_deps = EXTRA_DEPENDENCIES
65
+
66
+ #p.spec_extras = {} # A hash of extra values to set in the gemspec.
67
+ end
68
+
69
+ CHANGES = $hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
70
+ PATH = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
71
+ # $hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
72
+ $hoe.rsync_args = '-av --delete --ignore-errors'
73
+ $hoe.spec.post_install_message = File.open(File.dirname(__FILE__) + "/../PostInstall.txt").read rescue ""
@@ -0,0 +1,15 @@
1
+ require 'fileutils'
2
+ include FileUtils
3
+
4
+ require 'rubygems'
5
+ %w[rake hoe newgem rubigen].each do |req_gem|
6
+ begin
7
+ require req_gem
8
+ rescue LoadError
9
+ puts "This Rakefile requires the '#{req_gem}' RubyGem."
10
+ puts "Installation: gem install #{req_gem} -y"
11
+ exit
12
+ end
13
+ end
14
+
15
+ $:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
@@ -0,0 +1,15 @@
1
module GeoSpider
  module Extractors

    # Shared superclass of the extractors. It does nothing beyond remembering
    # the element it was built with, which subclasses read through @element.
    class Base

      # element - the object to extract from; stored untouched in @element.
      def initialize(element); @element = element; end

    end

  end
end
@@ -0,0 +1,23 @@
1
+ require 'geo-spider/extractors/microformat'
2
+ require 'geo-spider/extractors/postcode'
3
+
4
module GeoSpider
  module Extractors

    # Runs every known extractor over the same element and merges the results.
    class Master < GeoSpider::Extractors::Base

      # TODO: Handle duplicates from different data sources

      # Returns one flat array: the Microformat extractor's locations first,
      # followed by the Postcode extractor's locations.
      def locations
        extractor_classes = [Extractors::Microformat, Extractors::Postcode]
        extractor_classes.map { |klass| klass.new(@element).locations }.flatten
      end

    end

  end
end
@@ -0,0 +1,21 @@
1
+ require 'geo-spider/extractors/base'
2
+
3
module GeoSpider
  module Extractors

    # Extracts locations from geo microformats written with the abbr design
    # pattern, i.e. <abbr class="geo" title="LAT;LONG">label</abbr>.
    class Microformat < GeoSpider::Extractors::Base

      # Returns one Location per matching abbr element. The title attribute is
      # split on ";" into latitude and longitude, both converted with to_f;
      # the element's inner text becomes the location title.
      def locations
        geo_abbrs = @element.search("abbr[@class='geo'][@title]")
        geo_abbrs.map do |abbr|
          coordinates = abbr.attributes["title"].split(";")
          Location.new(
            :latitude  => coordinates[0].to_f,
            :longitude => coordinates[1].to_f,
            :title     => abbr.inner_text
          )
        end
      end

    end

  end
end
@@ -0,0 +1,40 @@
1
+ require 'geo-spider/extractors/base'
2
+ require 'graticule'
3
+
4
module GeoSpider

  module Extractors

    # Finds UK postcodes in the text of an element and geocodes each of them
    # through Graticule's Yahoo service.
    class Postcode < Base

      # Full BS 7666 postcode format. Source: http://en.wikipedia.org/wiki/UK_postcodes
      REGEXP = /(GIR 0AA|[A-PR-UWYZ]([0-9]{1,2}|([A-HK-Y][0-9]|[A-HK-Y][0-9]([0-9]|[ABEHMNPRV-Y]))|[0-9][A-HJKS-UW])(\s*)[0-9][ABD-HJLNP-UW-Z]{2})/i

      # Declared up front so that reading it before a key has been assigned
      # yields nil. Without this, the guard in #geocoder raised NameError
      # (uninitialized class variable) instead of the intended message.
      @@api_key = nil

      # Returns one Location per postcode found in the element's text, titled
      # with the matched postcode. Returns [] without touching the geocoder
      # (and therefore without needing an API key) when no postcode is found.
      #
      # Raises RuntimeError ("No Yahoo API key set") when postcodes are present
      # but no key has been configured.
      def locations
        # scan yields one array of capture groups per match; the first group
        # is the whole postcode.
        postcodes = @element.inner_text.scan(REGEXP).map { |groups| groups.first }
        return [] if postcodes.empty?

        service = geocoder # build the geocoder once rather than once per postcode

        postcodes.map do |postcode|
          # NOTE(review): assumes the geocoder's #location returns a
          # [latitude, longitude] pair - confirm against the Graticule version in use.
          latitude, longitude = service.location(postcode)
          Location.new(:latitude => latitude, :longitude => longitude, :title => postcode)
        end
      end

      # You need to set a valid Yahoo API key before the UK postcode geocoding will work. Yahoo have vastly better UK postcode accuracy than the other large mapping providers, apart from perhaps Multimap.

      def self.api_key=(api_key)
        @@api_key = api_key
      end

      private

      # Builds a Graticule Yahoo geocoder using the configured API key.
      def geocoder
        raise "No Yahoo API key set" unless @@api_key
        Graticule.service(:yahoo).new @@api_key
      end

    end

  end

end
@@ -0,0 +1,18 @@
1
module GeoSpider

  # A point found on a page: a latitude/longitude pair with an optional title.
  class Location

    attr_reader :longitude, :latitude, :title

    # params - Hash with :longitude and :latitude (both required) and an
    #          optional :title.
    #
    # Raises RuntimeError when :longitude or :latitude is missing or falsy;
    # longitude is checked first.
    def initialize(params = {})
      if !params[:longitude]
        raise "No longitude provided"
      elsif !params[:latitude]
        raise "No latitude provided"
      end

      @latitude, @longitude, @title = params[:latitude], params[:longitude], params[:title]
    end

  end

end
@@ -0,0 +1,83 @@
1
+ require 'geo-spider/location'
2
+ require 'geo-spider/extractors/master'
3
+
4
module GeoSpider

  # A single web page, fetched and parsed with Hpricot when it is constructed.
  # It exposes the page title, the locations found in its content and the
  # links it contains.
  class Page

    attr_reader :url

    DEFAULT_CONTENT_CSS_SELECTOR = "body" # Find locations within the entire body by default
    DEFAULT_TITLE_CSS_SELECTOR = "title" # Use the title in the head by default

    # Create a new page based on the URL.
    #
    # url     - String URL of the page.
    # options - Hash:
    #           :site                 - the Site this page belongs to (needed
    #                                   by #internal_links).
    #           :content_css_selector - selector for the element searched for
    #                                   locations (default "body").
    #           :title_css_selector   - selector for the title element
    #                                   (default "title").
    def initialize(url, options = {})
      @url = url
      @site = options[:site]
      @content_css_selector = options[:content_css_selector] || DEFAULT_CONTENT_CSS_SELECTOR
      @title_css_selector = options[:title_css_selector] || DEFAULT_TITLE_CSS_SELECTOR
      hpricot_doc # fetch and parse now, so a failing request surfaces in the constructor
    end

    # Returns the inner text of the first element matching the title selector.
    def title
      hpricot_doc.at(@title_css_selector).inner_text
    end

    # Returns an array of Location objects based on the locations found in the page.

    def locations
      body_element = hpricot_doc.at(@content_css_selector)
      master_extractor = Extractors::Master.new(body_element)
      master_extractor.locations
    end

    # Returns a unique array of URLs present in the page as strings, normalized to remove anchors.
    # Relative hrefs are resolved against this page's URL; URLs ending in a
    # known media/archive extension are dropped.

    def links
      hpricot_doc.search("a[@href]").map do |a|
        normalize_url(a.attributes["href"])
      end.uniq.reject { |b| rejected_url?(b) }
    end

    # Returns a unique array of internal URLs present in the page as string, normalized to remove anchors. Needs the page to know what site it is part of, or it cannot decide what is an internal link.
    #
    # Raises RuntimeError when the page was created without a :site option.

    def internal_links
      raise("Cannot discover internal links without knowing what site this page is part of.") if @site.nil?
      links.select { |l| internal_url?(l) }
    end

    private

    # Parsed document, built on first use and memoized.
    def hpricot_doc
      @hpricot_doc ||= Hpricot(raw_http)
    end

    # Opens the page URL, sending the configured User-Agent header.
    # NOTE(review): this relies on open-uri having extended Kernel#open to
    # handle URLs; newer Rubies expose that as URI.open - confirm the target
    # Ruby version before upgrading.
    def raw_http
      open(self.url, 'User-Agent' => GeoSpider::user_agent)
    end

    def internal_url?(url_to_test)
      # Does it begin with the URL of the site and what's the extension?
      url_to_test[0, @site.url.to_s.length] == @site.url.to_s
    end

    # Truthy when the URL ends in one of the listed media/archive extensions.
    def rejected_url?(url_to_test)
      url_to_test =~ /(mp3|m4a|mov|jpg|png|gif|zip|pdf)$/i
    end

    # Turns an href into an absolute URL string without its fragment.
    #
    # Relative hrefs are resolved against this page's URL. The previous code
    # called link_url.merge(@url) and threw the result away (and merged in the
    # wrong direction), so relative links were returned unresolved and could
    # never be recognised as internal.
    #
    # Returns "" when the href (or the resolution) is not a valid URI.
    def normalize_url(link_url)
      link = URI.parse(link_url)
      link = URI.parse(@url.to_s).merge(link) unless link.absolute?
      link.fragment = nil
      link.to_s
    rescue URI::Error # parent of InvalidURIError (bad href) and BadURIError (failed merge)
      ""
    end

  end


end
@@ -0,0 +1,50 @@
1
module GeoSpider

  # A website rooted at a base URL that can be spidered page by page.
  class Site

    attr_reader :url

    DEFAULT_REGEXP = /.+/ # By default match every URL

    # url - String base URL of the site; parsed into a URI.
    def initialize(url)
      @url = URI.parse(url)
    end

    # Crawls the site breadth-first starting at the base URL, following each
    # page's internal links, and yields every page whose URL matches the
    # pattern.
    #
    # options - Hash; :regexp (default DEFAULT_REGEXP) selects which pages are
    #           yielded. Every other entry is passed on to Page.new together
    #           with :site => self. The caller's hash is not modified.
    #
    # Each URL is fetched at most once, including URLs whose fetch fails.
    # Any StandardError raised while fetching, yielding or collecting links
    # makes the crawl skip to the next queued URL.
    def each_page(options = {}, &block)
      regexp = options[:regexp] || DEFAULT_REGEXP

      # Build the Page options on a copy instead of deleting :regexp from the
      # hash the caller handed in.
      page_options = options.reject { |key, _value| key == :regexp }.merge(:site => self)

      start_url = self.url.to_s
      queue = [start_url]
      # Every URL that has ever been queued. Recording a URL when it is queued
      # rather than after a successful fetch stops a failing URL from being
      # re-queued and re-fetched each time another page links to it.
      enqueued = { start_url => true }

      until queue.empty? do
        url = queue.shift
        begin
          page = Page.new(url, page_options)
          yield page if url =~ regexp

          page.internal_links.each do |link|
            next if enqueued[link]
            enqueued[link] = true
            queue << link
          end
        rescue # need to decide what exactly to rescue from, rather than just everything.
          next
        end
      end
    end

    # Returns an array of every page #each_page yields for the given options.
    def pages(options = {})
      pages = []

      self.each_page(options) do |page|
        pages << page
      end

      pages
    end


  end

end
@@ -0,0 +1,9 @@
1
module GeoSpider
  # Version number of the gem, as separate components and as a dotted string.
  module VERSION #:nodoc:
    MAJOR, MINOR, TINY = 0, 1, 0

    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
data/lib/geo-spider.rb ADDED
@@ -0,0 +1,23 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
+ require 'rubygems'
5
+ require 'hpricot'
6
+ require 'open-uri'
7
+
8
+ require 'geo-spider/page'
9
+ require 'geo-spider/site'
10
+
11
module GeoSpider

  DEFAULT_USER_AGENT = 'geo-spider (http://github.com/tomtaylor/geo-spider)'

  class << self
    # Overrides the User-Agent string; assigning nil or false restores the default.
    attr_writer :user_agent

    # User-Agent string for HTTP requests: the assigned value when one is
    # set, DEFAULT_USER_AGENT otherwise.
    def user_agent
      @user_agent ? @user_agent : DEFAULT_USER_AGENT
    end
  end

end
data/script/console ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+ # File: script/console
3
+ irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'
4
+
5
+ libs = " -r irb/completion"
6
+ # Perhaps use a console_lib to store any extra methods I may want available in the console
7
+ # libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
8
+ libs << " -r #{File.dirname(__FILE__) + '/../lib/geo-spider.rb'}"
9
+ puts "Loading geo-spider gem"
10
+ exec "#{irb} #{libs} --simple-prompt"
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/destroy'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
+ RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/generate'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
+ RubiGen::Scripts::Generate.new.run(ARGV)