geo-spider 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/License.txt +20 -0
- data/Manifest.txt +34 -0
- data/PostInstall.txt +0 -0
- data/README.txt +67 -0
- data/Rakefile +4 -0
- data/config/hoe.rb +73 -0
- data/config/requirements.rb +15 -0
- data/lib/geo-spider.rb +23 -0
- data/lib/geo-spider/extractors/base.rb +15 -0
- data/lib/geo-spider/extractors/master.rb +23 -0
- data/lib/geo-spider/extractors/microformat.rb +21 -0
- data/lib/geo-spider/extractors/postcode.rb +40 -0
- data/lib/geo-spider/location.rb +18 -0
- data/lib/geo-spider/page.rb +83 -0
- data/lib/geo-spider/site.rb +50 -0
- data/lib/geo-spider/version.rb +9 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/setup.rb +1585 -0
- data/spec/assets/pages/multiple_postcodes_and_microformats.html +15 -0
- data/spec/assets/pages/page_with_links.html +14 -0
- data/spec/assets/pages/separate_microformat_and_postcode.html +13 -0
- data/spec/assets/pages/single_microformat.html +13 -0
- data/spec/assets/pages/single_postcode.html +13 -0
- data/spec/geo-spider/page_spec.rb +125 -0
- data/spec/geo-spider/site_spec.rb +8 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +19 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +21 -0
- data/tasks/website.rake +9 -0
- metadata +101 -0
@@ -0,0 +1,15 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Multiple Microformats and Postcodes</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<p>
|
10
|
+
BBC Broadcasting House is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in Southwark.
|
11
|
+
</p>
|
12
|
+
|
13
|
+
<p>Headshift are at SE1 2NQ, but our favourite pub is the <abbr class="geo" title="51.503587;-0.075939">Anchor Tap</abbr>.</p>
|
14
|
+
</body>
|
15
|
+
</html>
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Page with Links</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<h1>Heading 1</h1>
|
10
|
+
<p>
|
11
|
+
<a href="http://www.example.com/broadcastinghouse">BBC Broadcasting House</a> is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in <a href="http://www.external.com/southwark">Southwark</a>. How about an <a href="http://www.example.com/download.mp3">MP3</a>?
|
12
|
+
</p>
|
13
|
+
</body>
|
14
|
+
</html>
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Separate Microformat and Postcode</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<p>
|
10
|
+
BBC Broadcasting House is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in Southwark.
|
11
|
+
</p>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Single Microformat</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<p>
|
10
|
+
<abbr class="geo" title="51.517570;-0.138770">BBC Broadcasting House</abbr>
|
11
|
+
</p>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Single Postcode</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<p>
|
10
|
+
BBC Broadcasting House is at W1A 1AA.
|
11
|
+
</p>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
# A page whose only geodata is one <abbr class="geo"> microformat.
describe Page, "with a single microformat which is being parsed" do

  before do
    # Stub OpenURI so the fixture HTML is returned when the page is opened.
    fixture_html = page_as_string('single_microformat.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.example.com")
  end

  it "should find one location" do
    @page.locations.length.should == 1
  end

  it "should have the right location details" do
    # Expected values mirror the title attribute and text of the fixture's abbr.
    found = @page.locations.first
    found.latitude.should == 51.51757
    found.longitude.should == -0.13877
    found.title.should == "BBC Broadcasting House"
  end

end
|
21
|
+
|
22
|
+
# A page whose only geodata is one postcode in running text; the geocoder
# lookup is replaced with a canned result.
describe Page, "with a single postcode which is being parsed" do

  before do
    # Stub OpenURI so the fixture HTML is returned when the page is opened.
    fixture_html = page_as_string('single_postcode.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.example.com")
    GeoSpider::Extractors::Postcode.api_key = "waffles"

    # Canned geocoder object exposing a fixed [latitude, longitude] pair.
    geocoder_double = OpenStruct.new(:location => [51.0, -1.0])
    # NOTE(review): the stub has no return value, so the expectation below is
    # registered on whatever Graticule.service returns (nil) -- confirm intended.
    Graticule.stub!(:service)
    Graticule.service.should_receive(:new).and_return(geocoder_double)
  end

  it "should find one location" do
    @page.locations.length.should == 1
  end

  it "should have the right location details" do
    found = @page.locations.first
    found.latitude.should == 51.0
    found.longitude.should == -1.0
    found.title.should == "W1A 1AA"
  end

end
|
45
|
+
|
46
|
+
# The fixture holds two postcodes and two geo microformats, so the geocoder
# is expected to be instantiated twice and four locations found in total.
describe Page, "with multiple microformats and postcodes being parsed" do

  before do
    # Stub OpenURI so the fixture HTML is returned when the page is opened.
    fixture_html = page_as_string('multiple_postcodes_and_microformats.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.example.com")

    # Canned geocoder object exposing a fixed [latitude, longitude] pair.
    geocoder_double = OpenStruct.new(:location => [51.0, -1.0])
    # NOTE(review): the stub has no return value, so the expectation below is
    # registered on whatever Graticule.service returns (nil) -- confirm intended.
    Graticule.stub!(:service)
    Graticule.service.should_receive(:new).twice.and_return(geocoder_double)
  end

  it "should find four locations" do
    @page.locations.length.should == 4
  end

end
|
62
|
+
|
63
|
+
# A Page constructed without a :site option.
describe Page, "which is not part of a site" do

  before do
    # Stub OpenURI so the fixture HTML is returned when the page is opened.
    fixture_html = page_as_string('page_with_links.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.waffles.com")
  end

  it "should raise if you try and get the internal_links" do
    # Any error class satisfies this expectation.
    fetch_internal_links = lambda { @page.internal_links }
    fetch_internal_links.should raise_error
  end

end
|
75
|
+
|
76
|
+
# A Page that belongs to a Site. The fixture contains three anchors: two on
# www.example.com (one of them an .mp3 download) and one on www.external.com.
describe Page, "which is part of a site" do

  before(:each) do
    # Stub OpenURI so the fixture HTML is returned when the page is opened.
    OpenURI.should_receive(:open_uri).and_return(page_as_string('page_with_links.html'))
    @site = Site.new("http://www.example.com/")
    @page = Page.new("http://www.example.com/waffles", :site => @site)
  end

  it "should be able to extract them all" do
    # Two rather than three: the media link is checked separately below.
    @page.links.length.should == 2
  end

  it "should be able to extract just the internal links" do
    @page.internal_links.length.should == 1
    # Dots are escaped and the match is anchored at the start of the string so
    # that look-alike hosts (e.g. "wwwXexample.com") cannot pass as internal.
    site_prefix = %r{\Ahttp://www\.example\.com/}
    @page.internal_links.reject { |l| l =~ site_prefix }.length.should == 0
  end

  it "should exclude the media links" do
    @page.links.should_not include("http://www.example.com/download.mp3")
  end
end
|
97
|
+
|
98
|
+
# Title lookup: by default the <title> element is used; a CSS selector can be
# supplied through :title_css_selector to read the title from elsewhere.
describe Page, "which is finding the title" do

  before do
    # Stub OpenURI so the fixture HTML is returned when the page is opened.
    fixture_html = page_as_string('page_with_links.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
  end

  describe "using the default" do

    before do
      @page = Page.new("http://www.example.com")
    end

    it "should find the title from the head" do
      @page.title.should == "Page with Links"
    end

  end

  describe "specifying a h1 css selector" do

    before do
      @page = Page.new("http://www.example.com", :title_css_selector => "h1")
    end

    it "should find the title from the h1 tag" do
      @page.title.should == "Heading 1"
    end

  end

end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
begin
|
2
|
+
require 'spec'
|
3
|
+
rescue LoadError
|
4
|
+
require 'rubygems'
|
5
|
+
gem 'rspec'
|
6
|
+
require 'spec'
|
7
|
+
end
|
8
|
+
|
9
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
10
|
+
require 'geo-spider'
|
11
|
+
require 'ostruct'
|
12
|
+
include GeoSpider
|
13
|
+
|
14
|
+
# Set up the api key for testing so it doesn't raise
|
15
|
+
GeoSpider::Extractors::Postcode.api_key = "waffles"
|
16
|
+
|
17
|
+
# Returns the full contents of a fixture page as a String.
#
# page_path - file name (or relative path) of the fixture inside the
#             "assets/pages" directory that sits next to this file.
def page_as_string(page_path)
  fixture_dir = File.join(File.dirname(__FILE__), "assets", "pages")
  File.read(File.join(fixture_dir, page_path))
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
desc 'Release the website and new gem version'
task :deploy => [:check_version, :website, :release] do
  # Both the trunk and the tag URL share this repository root.
  svn_root = "svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}"

  # Only prints a reminder; the tag itself is not created by this task.
  puts "Remember to create SVN tag:"
  puts "svn copy #{svn_root}/trunk #{svn_root}/tags/REL-#{VERS} "
  puts "Suggested comment:"
  puts "Tagging release #{CHANGES}"
end
|
9
|
+
|
10
|
+
# Aggregate task with no body of its own: it only runs its prerequisites.
desc 'Runs tasks website_generate and install_gem as a local deployment of the gem'
task(:local_deploy => [:website_generate, :install_gem])
|
12
|
+
|
13
|
+
# Guards a release: the VERSION environment variable must be supplied and
# must equal VERS. On failure the message goes to stderr and rake terminates
# with a non-zero exit status (a bare `exit` would report success).
task :check_version do
  requested = ENV['VERSION']
  abort 'Must pass a VERSION=x.y.z release version' unless requested

  unless requested == VERS
    abort "Please update your version.rb to match the release version, currently #{VERS}"
  end
end
|
23
|
+
|
24
|
+
desc 'Install the package as a gem, without generating documentation(ri/rdoc)'
task :install_gem_no_doc => [:clean, :package] do
  # The command is prefixed with sudo unless Hoe::WINDOZE is set.
  sudo_prefix = Hoe::WINDOZE ? '' : 'sudo '
  sh "#{sudo_prefix}gem install pkg/*.gem --no-rdoc --no-ri"
end
|
28
|
+
|
29
|
+
namespace :manifest do
  desc 'Recreate Manifest.txt to include ALL files'
  task :refresh do
    # Pipes the output of `rake check_manifest` through patch and redirects
    # the result into Manifest.txt; the command's own output is discarded.
    %x(rake check_manifest | patch -p0 > Manifest.txt)
  end
end
|
data/tasks/rspec.rake
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
begin
|
2
|
+
require 'spec'
|
3
|
+
rescue LoadError
|
4
|
+
require 'rubygems'
|
5
|
+
require 'spec'
|
6
|
+
end
|
7
|
+
begin
|
8
|
+
require 'spec/rake/spectask'
|
9
|
+
rescue LoadError
|
10
|
+
puts <<-EOS
|
11
|
+
To use rspec for testing you must install rspec gem:
|
12
|
+
gem install rspec
|
13
|
+
EOS
|
14
|
+
exit(0)
|
15
|
+
end
|
16
|
+
|
17
|
+
# Defines the spec task. The description matches the file list below, which
# picks up every *_spec.rb file anywhere under spec/ (not only spec/models).
desc "Run all specs under spec/"
Spec::Rake::SpecTask.new do |t|
  # Runner options are read from spec/spec.opts.
  t.spec_opts = ['--options', "spec/spec.opts"]
  t.spec_files = FileList['spec/**/*_spec.rb']
end
|
data/tasks/website.rake
ADDED
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: geo-spider
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tom Taylor
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-09-29 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.7.0
|
24
|
+
version:
|
25
|
+
description: Tool for spidering websites, extracting pages with geodata.
|
26
|
+
email:
|
27
|
+
- tom@tomtaylor.co.uk
|
28
|
+
executables: []
|
29
|
+
|
30
|
+
extensions: []
|
31
|
+
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- License.txt
|
35
|
+
- Manifest.txt
|
36
|
+
- PostInstall.txt
|
37
|
+
- README.txt
|
38
|
+
files:
|
39
|
+
- History.txt
|
40
|
+
- License.txt
|
41
|
+
- Manifest.txt
|
42
|
+
- PostInstall.txt
|
43
|
+
- README.txt
|
44
|
+
- Rakefile
|
45
|
+
- config/hoe.rb
|
46
|
+
- config/requirements.rb
|
47
|
+
- lib/geo-spider.rb
|
48
|
+
- lib/geo-spider/extractors/base.rb
|
49
|
+
- lib/geo-spider/extractors/master.rb
|
50
|
+
- lib/geo-spider/extractors/microformat.rb
|
51
|
+
- lib/geo-spider/extractors/postcode.rb
|
52
|
+
- lib/geo-spider/location.rb
|
53
|
+
- lib/geo-spider/page.rb
|
54
|
+
- lib/geo-spider/site.rb
|
55
|
+
- lib/geo-spider/version.rb
|
56
|
+
- script/console
|
57
|
+
- script/destroy
|
58
|
+
- script/generate
|
59
|
+
- setup.rb
|
60
|
+
- spec/assets/pages/multiple_postcodes_and_microformats.html
|
61
|
+
- spec/assets/pages/page_with_links.html
|
62
|
+
- spec/assets/pages/separate_microformat_and_postcode.html
|
63
|
+
- spec/assets/pages/single_microformat.html
|
64
|
+
- spec/assets/pages/single_postcode.html
|
65
|
+
- spec/geo-spider/page_spec.rb
|
66
|
+
- spec/geo-spider/site_spec.rb
|
67
|
+
- spec/spec.opts
|
68
|
+
- spec/spec_helper.rb
|
69
|
+
- tasks/deployment.rake
|
70
|
+
- tasks/environment.rake
|
71
|
+
- tasks/rspec.rake
|
72
|
+
- tasks/website.rake
|
73
|
+
has_rdoc: true
|
74
|
+
homepage: http://geospider.rubyforge.org
|
75
|
+
post_install_message: ""
|
76
|
+
rdoc_options:
|
77
|
+
- --main
|
78
|
+
- README.txt
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: "0"
|
86
|
+
version:
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: "0"
|
92
|
+
version:
|
93
|
+
requirements: []
|
94
|
+
|
95
|
+
rubyforge_project: geospider
|
96
|
+
rubygems_version: 1.2.0
|
97
|
+
signing_key:
|
98
|
+
specification_version: 2
|
99
|
+
summary: Tool for spidering websites, extracting pages with geodata.
|
100
|
+
test_files: []
|
101
|
+
|