geo-spider 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/License.txt +20 -0
- data/Manifest.txt +34 -0
- data/PostInstall.txt +0 -0
- data/README.txt +67 -0
- data/Rakefile +4 -0
- data/config/hoe.rb +73 -0
- data/config/requirements.rb +15 -0
- data/lib/geo-spider.rb +23 -0
- data/lib/geo-spider/extractors/base.rb +15 -0
- data/lib/geo-spider/extractors/master.rb +23 -0
- data/lib/geo-spider/extractors/microformat.rb +21 -0
- data/lib/geo-spider/extractors/postcode.rb +40 -0
- data/lib/geo-spider/location.rb +18 -0
- data/lib/geo-spider/page.rb +83 -0
- data/lib/geo-spider/site.rb +50 -0
- data/lib/geo-spider/version.rb +9 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/setup.rb +1585 -0
- data/spec/assets/pages/multiple_postcodes_and_microformats.html +15 -0
- data/spec/assets/pages/page_with_links.html +14 -0
- data/spec/assets/pages/separate_microformat_and_postcode.html +13 -0
- data/spec/assets/pages/single_microformat.html +13 -0
- data/spec/assets/pages/single_postcode.html +13 -0
- data/spec/geo-spider/page_spec.rb +125 -0
- data/spec/geo-spider/site_spec.rb +8 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +19 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +21 -0
- data/tasks/website.rake +9 -0
- metadata +101 -0
data/History.txt
ADDED
data/License.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 Tom Taylor
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Manifest.txt
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
History.txt
|
2
|
+
License.txt
|
3
|
+
Manifest.txt
|
4
|
+
PostInstall.txt
|
5
|
+
README.txt
|
6
|
+
Rakefile
|
7
|
+
config/hoe.rb
|
8
|
+
config/requirements.rb
|
9
|
+
lib/geo-spider.rb
|
10
|
+
lib/geo-spider/extractors/base.rb
|
11
|
+
lib/geo-spider/extractors/master.rb
|
12
|
+
lib/geo-spider/extractors/microformat.rb
|
13
|
+
lib/geo-spider/extractors/postcode.rb
|
14
|
+
lib/geo-spider/location.rb
|
15
|
+
lib/geo-spider/page.rb
|
16
|
+
lib/geo-spider/site.rb
|
17
|
+
lib/geo-spider/version.rb
|
18
|
+
script/console
|
19
|
+
script/destroy
|
20
|
+
script/generate
|
21
|
+
setup.rb
|
22
|
+
spec/assets/pages/multiple_postcodes_and_microformats.html
|
23
|
+
spec/assets/pages/page_with_links.html
|
24
|
+
spec/assets/pages/separate_microformat_and_postcode.html
|
25
|
+
spec/assets/pages/single_microformat.html
|
26
|
+
spec/assets/pages/single_postcode.html
|
27
|
+
spec/geo-spider/page_spec.rb
|
28
|
+
spec/geo-spider/site_spec.rb
|
29
|
+
spec/spec.opts
|
30
|
+
spec/spec_helper.rb
|
31
|
+
tasks/deployment.rake
|
32
|
+
tasks/environment.rake
|
33
|
+
tasks/rspec.rake
|
34
|
+
tasks/website.rake
|
data/PostInstall.txt
ADDED
File without changes
|
data/README.txt
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
= geo-spider
|
2
|
+
|
3
|
+
* http://geospider.rubyforge.org
|
4
|
+
* http://github.com/tomtaylor/geo-spider
|
5
|
+
|
6
|
+
== DESCRIPTION:
|
7
|
+
|
8
|
+
Tool for spidering websites/blogs, extracting geodata from specific pages.
|
9
|
+
|
10
|
+
Starting at a base URL, it will spider every page underneath, returning pages which have a URL that matches a desired pattern.
|
11
|
+
|
12
|
+
The typical use case is spidering an entire blog for posts which contain geodata.
|
13
|
+
|
14
|
+
Different methods for extracting geodata can be used. It currently supports UK postcodes and the abbr design pattern geo microformat <http://microformats.org/wiki/geo>.
|
15
|
+
|
16
|
+
It is currently in use behind the scenes of the Geoblogomatic <http://www.geoblogomatic.com>
|
17
|
+
|
18
|
+
== FEATURES/PROBLEMS:
|
19
|
+
|
20
|
+
* Still very much in development.
|
21
|
+
|
22
|
+
== SYNOPSIS:
|
23
|
+
|
24
|
+
Spider entire sites like so:
|
25
|
+
|
26
|
+
require 'geo-spider'
|
27
|
+
site = GeoSpider::Site.new("http://www.piecesofhackney.co.uk")
|
28
|
+
|
29
|
+
site.each_page do |page|
|
30
|
+
puts page.locations.inspect
|
31
|
+
end
|
32
|
+
|
33
|
+
Extract geodata from a specific page like so:
|
34
|
+
|
35
|
+
require 'geo-spider'
|
36
|
+
page = GeoSpider::Page.new("http://www.nothingtoseehere.net/2008/07/t34_tank_london_1.html")
|
37
|
+
puts page.locations.inspect
|
38
|
+
|
39
|
+
== REQUIREMENTS:
|
40
|
+
|
41
|
+
* hpricot (http://code.whytheluckystiff.net/hpricot/) - for HTML parsing
|
42
|
+
* graticule (http://graticule.rubyforge.org/) - for geocoding
|
43
|
+
|
44
|
+
== LICENSE:
|
45
|
+
|
46
|
+
(The MIT License)
|
47
|
+
|
48
|
+
Copyright (c) 2008 Tom Taylor
|
49
|
+
|
50
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
51
|
+
a copy of this software and associated documentation files (the
|
52
|
+
'Software'), to deal in the Software without restriction, including
|
53
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
54
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
55
|
+
permit persons to whom the Software is furnished to do so, subject to
|
56
|
+
the following conditions:
|
57
|
+
|
58
|
+
The above copyright notice and this permission notice shall be
|
59
|
+
included in all copies or substantial portions of the Software.
|
60
|
+
|
61
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
62
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
63
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
64
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
65
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
66
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
67
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
data/config/hoe.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'geo-spider/version'
|
2
|
+
|
3
|
+
AUTHOR = 'Tom Taylor' # can also be an array of Authors
|
4
|
+
EMAIL = "tom@tomtaylor.co.uk"
|
5
|
+
DESCRIPTION = "Tool for spidering websites, extracting pages with geodata."
|
6
|
+
GEM_NAME = 'geo-spider' # what ppl will type to install your gem
|
7
|
+
RUBYFORGE_PROJECT = 'geospider' # The unix name for your project
|
8
|
+
HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
|
9
|
+
DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
|
10
|
+
# Runtime gem dependencies, one [name, version requirement] pair per gem.
# Each gem needs its own pair: a single ['hpricot', 'graticule'] entry would
# be read as "hpricot, version graticule".
EXTRA_DEPENDENCIES = [
  ['hpricot', '>= 0'],
  ['graticule', '>= 0']
] # An array of rubygem dependencies [name, version]
|
13
|
+
|
14
|
+
@config_file = "~/.rubyforge/user-config.yml"
|
15
|
+
@config = nil
|
16
|
+
RUBYFORGE_USERNAME = "tomtaylor"
|
17
|
+
def rubyforge_username
|
18
|
+
unless @config
|
19
|
+
begin
|
20
|
+
@config = YAML.load(File.read(File.expand_path(@config_file)))
|
21
|
+
rescue
|
22
|
+
puts <<-EOS
|
23
|
+
ERROR: No rubyforge config file found: #{@config_file}
|
24
|
+
Run 'rubyforge setup' to prepare your env for access to Rubyforge
|
25
|
+
- See http://newgem.rubyforge.org/rubyforge.html for more details
|
26
|
+
EOS
|
27
|
+
exit
|
28
|
+
end
|
29
|
+
end
|
30
|
+
RUBYFORGE_USERNAME.replace @config["username"]
|
31
|
+
end
|
32
|
+
|
33
|
+
|
34
|
+
REV = nil
|
35
|
+
# UNCOMMENT IF REQUIRED:
|
36
|
+
# REV = YAML.load(`svn info`)['Revision']
|
37
|
+
VERS = GeoSpider::VERSION::STRING + (REV ? ".#{REV}" : "")
|
38
|
+
RDOC_OPTS = ['--quiet', '--title', 'geo-spider documentation',
|
39
|
+
"--opname", "index.html",
|
40
|
+
"--line-numbers",
|
41
|
+
"--main", "README",
|
42
|
+
"--inline-source"]
|
43
|
+
|
44
|
+
class Hoe
|
45
|
+
def extra_deps
|
46
|
+
@extra_deps.reject! { |x| Array(x).first == 'hoe' }
|
47
|
+
@extra_deps
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
# Generate all the Rake tasks
|
52
|
+
# Run 'rake -T' to see list of generated tasks (from gem root directory)
|
53
|
+
$hoe = Hoe.new(GEM_NAME, VERS) do |p|
|
54
|
+
p.developer(AUTHOR, EMAIL)
|
55
|
+
p.description = DESCRIPTION
|
56
|
+
p.summary = DESCRIPTION
|
57
|
+
p.url = HOMEPATH
|
58
|
+
p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
|
59
|
+
p.test_globs = ["test/**/test_*.rb"]
|
60
|
+
p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store'] #An array of file patterns to delete on clean.
|
61
|
+
p.remote_rdoc_dir = ''
|
62
|
+
# == Optional
|
63
|
+
p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
|
64
|
+
#p.extra_deps = EXTRA_DEPENDENCIES
|
65
|
+
|
66
|
+
#p.spec_extras = {} # A hash of extra values to set in the gemspec.
|
67
|
+
end
|
68
|
+
|
69
|
+
CHANGES = $hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
|
70
|
+
PATH = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
|
71
|
+
# $hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
|
72
|
+
$hoe.rsync_args = '-av --delete --ignore-errors'
|
73
|
+
$hoe.spec.post_install_message = File.open(File.dirname(__FILE__) + "/../PostInstall.txt").read rescue ""
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
include FileUtils
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
%w[rake hoe newgem rubigen].each do |req_gem|
|
6
|
+
begin
|
7
|
+
require req_gem
|
8
|
+
rescue LoadError
|
9
|
+
puts "This Rakefile requires the '#{req_gem}' RubyGem."
|
10
|
+
puts "Installation: gem install #{req_gem} -y"
|
11
|
+
exit
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
$:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
|
data/lib/geo-spider.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__)) unless
|
2
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'hpricot'
|
6
|
+
require 'open-uri'
|
7
|
+
|
8
|
+
require 'geo-spider/page'
|
9
|
+
require 'geo-spider/site'
|
10
|
+
|
11
|
+
module GeoSpider
|
12
|
+
|
13
|
+
DEFAULT_USER_AGENT = 'geo-spider (http://github.com/tomtaylor/geo-spider)'
|
14
|
+
|
15
|
+
def self.user_agent
|
16
|
+
@user_agent || DEFAULT_USER_AGENT
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.user_agent=(user_agent)
|
20
|
+
@user_agent = user_agent
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'geo-spider/extractors/microformat'
|
2
|
+
require 'geo-spider/extractors/postcode'
|
3
|
+
|
4
|
+
module GeoSpider
|
5
|
+
|
6
|
+
module Extractors
|
7
|
+
|
8
|
+
class Master < GeoSpider::Extractors::Base
|
9
|
+
|
10
|
+
# TODO: Handle duplicates from different data sources
|
11
|
+
|
12
|
+
def locations
|
13
|
+
microformat_locations = Extractors::Microformat.new(@element).locations
|
14
|
+
postcode_locations = Extractors::Postcode.new(@element).locations
|
15
|
+
|
16
|
+
(microformat_locations + postcode_locations).flatten
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'geo-spider/extractors/base'
|
2
|
+
|
3
|
+
module GeoSpider

  module Extractors

    # Extracts locations from geo microformats written with the abbr design
    # pattern, i.e. <abbr class="geo" title="LATITUDE;LONGITUDE">text</abbr>.
    class Microformat < GeoSpider::Extractors::Base

      # Returns an array of Location objects, one per abbr element whose
      # class attribute is exactly "geo" and which carries a title attribute.
      # The title is split on ";" into latitude and longitude; the element's
      # inner text becomes the location title.
      #
      # Elements whose title does not yield two numeric parts are skipped.
      # Converting them blindly with to_f would turn a missing or garbled
      # part into 0.0 and report a location that the page never stated.
      def locations
        @element.search("abbr[@class='geo'][@title]").map do |geo|
          latitude, longitude = geo.attributes["title"].split(";").map { |part| parse_coordinate(part) }
          next if latitude.nil? || longitude.nil?

          Location.new(:latitude => latitude, :longitude => longitude, :title => geo.inner_text)
        end.compact
      end

      private

      # Returns +text+ as a Float when it starts (after optional whitespace
      # and sign) with a digit or a decimal point followed by a digit;
      # returns nil otherwise.
      def parse_coordinate(text)
        text.to_f if text.to_s =~ /\A\s*[-+]?(\d|\.\d)/
      end

    end

  end

end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'geo-spider/extractors/base'
|
2
|
+
require 'graticule'
|
3
|
+
|
4
|
+
module GeoSpider

  module Extractors

    # Extracts UK postcodes from the text of an element and geocodes each
    # one through Graticule's Yahoo service.
    class Postcode < Base

      # Full BS 7666 postcode format. Source: http://en.wikipedia.org/wiki/UK_postcodes
      REGEXP = /(GIR 0AA|[A-PR-UWYZ]([0-9]{1,2}|([A-HK-Y][0-9]|[A-HK-Y][0-9]([0-9]|[ABEHMNPRV-Y]))|[0-9][A-HJKS-UW])(\s*)[0-9][ABD-HJLNP-UW-Z]{2})/i

      # Start out with no key. Reading a class variable that was never
      # assigned raises NameError, which would mask the explicit
      # "No Yahoo API key set" error raised by #geocoder.
      @@api_key = nil unless defined?(@@api_key)

      # Returns an array of Location objects, one for every postcode matched
      # by REGEXP in the element's inner text. The matched postcode text is
      # used as the location title.
      #
      # Raises RuntimeError ("No Yahoo API key set") when a postcode is found
      # but no API key has been configured. Text without postcodes never
      # touches the geocoder and so never raises for a missing key.
      def locations
        # scan returns one array of capture groups per match; the first
        # group is the whole postcode.
        results = @element.inner_text.scan(REGEXP)
        results = results.map(&:first)

        results.map do |result|
          # NOTE(review): assumes the Graticule geocoder responds to #location
          # and returns a [latitude, longitude] pair - confirm against the
          # Graticule API.
          latitude, longitude = geocoder.location(result)
          Location.new(:latitude => latitude, :longitude => longitude, :title => result)
        end
      end

      # You need to set a valid Yahoo API key before the UK postcode geocoding will work. Yahoo have vastly better UK postcode accuracy than the other large mapping providers, apart from perhaps Multimap.

      def self.api_key=(api_key)
        @@api_key = api_key
      end

      private

      # Builds the Yahoo geocoder on first use and reuses it for the other
      # postcodes handled by this extractor instance.
      def geocoder
        raise "No Yahoo API key set" unless @@api_key
        @geocoder ||= Graticule.service(:yahoo).new(@@api_key)
      end

    end

  end

end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module GeoSpider
|
2
|
+
|
3
|
+
class Location
|
4
|
+
|
5
|
+
attr_reader :longitude, :latitude, :title
|
6
|
+
|
7
|
+
def initialize(params = {})
|
8
|
+
raise "No longitude provided" unless params[:longitude]
|
9
|
+
raise "No latitude provided" unless params[:latitude]
|
10
|
+
|
11
|
+
@latitude = params[:latitude]
|
12
|
+
@longitude = params[:longitude]
|
13
|
+
@title = params[:title]
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'geo-spider/location'
|
2
|
+
require 'geo-spider/extractors/master'
|
3
|
+
|
4
|
+
module GeoSpider

  # A single web page, fetched from its URL and parsed with Hpricot. Exposes
  # the page title, the locations found in its content and its outgoing links.
  class Page

    attr_reader :url

    DEFAULT_CONTENT_CSS_SELECTOR = "body" # Find locations within the entire body by default
    DEFAULT_TITLE_CSS_SELECTOR = "title" # Use the title in the head by default

    # Links whose URL ends in one of these file extensions are left out of
    # #links. The dot and the end-of-string anchor keep URLs such as
    # "http://example.com/zip" from being rejected by accident.
    REJECTED_URL_REGEXP = /\.(mp3|m4a|mov|jpg|png|gif|zip|pdf)\z/i

    # Create a new page based on the URL. The document is fetched and parsed
    # immediately.
    #
    # Options:
    # :site                 - the site this page belongs to (needed by #internal_links)
    # :content_css_selector - selector of the element searched for locations
    # :title_css_selector   - selector of the element holding the page title

    def initialize(url, options = {})
      @url = url
      @site = options[:site]
      @content_css_selector = options[:content_css_selector] || DEFAULT_CONTENT_CSS_SELECTOR
      @title_css_selector = options[:title_css_selector] || DEFAULT_TITLE_CSS_SELECTOR
      hpricot_doc
    end

    # Returns the inner text of the element matched by the title selector.

    def title
      hpricot_doc.at(@title_css_selector).inner_text
    end

    # Returns an array of Location objects based on the locations found in the page.

    def locations
      body_element = hpricot_doc.at(@content_css_selector)
      master_extractor = Extractors::Master.new(body_element)
      master_extractor.locations
    end

    # Returns a unique array of URLs present in the page as strings, made
    # absolute against the page URL and normalized to remove anchors. Links
    # that cannot be parsed, and links to files matched by
    # REJECTED_URL_REGEXP, are left out.

    def links
      normalized = hpricot_doc.search("a[@href]").map do |a|
        normalize_url(a.attributes["href"])
      end
      normalized.uniq.reject { |link| link.empty? || rejected_url?(link) }
    end

    # Returns a unique array of internal URLs present in the page as strings, normalized to remove anchors. Needs the page to know what site it is part of, or it cannot decide what is an internal link.

    def internal_links
      raise("Cannot discover internal links without knowing what site this page is part of.") if @site.nil?
      links.select { |l| internal_url?(l) }
    end

    private

    # Parsed document, built on first use and cached afterwards.
    def hpricot_doc
      @hpricot_doc ||= Hpricot(raw_http)
    end

    # Opens the page URL, sending the configured User-Agent header.
    #
    # URI.open is used when open-uri provides it. Kernel#open no longer
    # handles URLs on Ruby 3.0+, and it treats a leading "|" as a command
    # to run. Older Rubies without URI.open fall back to Kernel#open.
    def raw_http
      headers = { 'User-Agent' => GeoSpider.user_agent }
      if URI.respond_to?(:open)
        URI.open(url.to_s, headers)
      else
        open(url, headers)
      end
    end

    def internal_url?(url_to_test)
      # Does it begin with the URL of the site?
      url_to_test[0, @site.url.to_s.length] == @site.url.to_s
    end

    # Truthy when the URL ends in one of the rejected file extensions.
    def rejected_url?(url_to_test)
      url_to_test =~ REJECTED_URL_REGEXP
    end

    # Returns +link_url+ as an absolute URL string without its fragment.
    # Relative links are resolved against the URL of this page. Returns ""
    # when the link cannot be parsed or resolved.
    def normalize_url(link_url)
      link_url = URI.parse(link_url)
      # The page URL is the base and the link is merged onto it. merge
      # returns a new URI, so the result has to be kept.
      link_url = URI.parse(@url.to_s).merge(link_url) unless link_url.absolute?
      link_url.fragment = nil
      link_url.to_s
    rescue URI::InvalidURIError, URI::BadURIError
      ""
    end

  end


end
|