geo-spider 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/License.txt +20 -0
- data/Manifest.txt +34 -0
- data/PostInstall.txt +0 -0
- data/README.txt +67 -0
- data/Rakefile +4 -0
- data/config/hoe.rb +73 -0
- data/config/requirements.rb +15 -0
- data/lib/geo-spider.rb +23 -0
- data/lib/geo-spider/extractors/base.rb +15 -0
- data/lib/geo-spider/extractors/master.rb +23 -0
- data/lib/geo-spider/extractors/microformat.rb +21 -0
- data/lib/geo-spider/extractors/postcode.rb +40 -0
- data/lib/geo-spider/location.rb +18 -0
- data/lib/geo-spider/page.rb +83 -0
- data/lib/geo-spider/site.rb +50 -0
- data/lib/geo-spider/version.rb +9 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/setup.rb +1585 -0
- data/spec/assets/pages/multiple_postcodes_and_microformats.html +15 -0
- data/spec/assets/pages/page_with_links.html +14 -0
- data/spec/assets/pages/separate_microformat_and_postcode.html +13 -0
- data/spec/assets/pages/single_microformat.html +13 -0
- data/spec/assets/pages/single_postcode.html +13 -0
- data/spec/geo-spider/page_spec.rb +125 -0
- data/spec/geo-spider/site_spec.rb +8 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +19 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +21 -0
- data/tasks/website.rake +9 -0
- metadata +101 -0
@@ -0,0 +1,15 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Multiple Microformats and Postcodes</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<p>
|
10
|
+
BBC Broadcasting House is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in Southwark.
|
11
|
+
</p>
|
12
|
+
|
13
|
+
<p>Headshift are at SE1 2NQ, but our favourite pub is the <abbr class="geo" title="51.503587;-0.075939">Anchor Tap</abbr>.</p>
|
14
|
+
</body>
|
15
|
+
</html>
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Page with Links</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<h1>Heading 1</h1>
|
10
|
+
<p>
|
11
|
+
<a href="http://www.example.com/broadcastinghouse">BBC Broadcasting House</a> is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in <a href="http://www.external.com/southwark">Southwark</a>. How about an <a href="http://www.example.com/download.mp3">MP3</a>?
|
12
|
+
</p>
|
13
|
+
</body>
|
14
|
+
</html>
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Separate Microformat and Postcode</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<p>
|
10
|
+
BBC Broadcasting House is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in Southwark.
|
11
|
+
</p>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Single Microformat</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<p>
|
10
|
+
<abbr class="geo" title="51.517570;-0.138770">BBC Broadcasting House</abbr>
|
11
|
+
</p>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html>
|
4
|
+
<head>
|
5
|
+
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
|
6
|
+
<title>Single Postcode</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<p>
|
10
|
+
BBC Broadcasting House is at W1A 1AA.
|
11
|
+
</p>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
describe Page, "with a single microformat which is being parsed" do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
OpenURI.should_receive(:open_uri).and_return(page_as_string('single_microformat.html'))
|
7
|
+
@page = Page.new("http://www.example.com")
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should find one location" do
|
11
|
+
@page.locations.length.should == 1
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should have the right location details" do
|
15
|
+
location = @page.locations.first
|
16
|
+
location.latitude.should == 51.51757
|
17
|
+
location.longitude.should == -0.13877
|
18
|
+
location.title.should == "BBC Broadcasting House"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe Page, "with a single postcode which is being parsed" do
|
23
|
+
|
24
|
+
before(:each) do
|
25
|
+
OpenURI.should_receive(:open_uri).and_return(page_as_string('single_postcode.html'))
|
26
|
+
@page = Page.new("http://www.example.com")
|
27
|
+
GeoSpider::Extractors::Postcode.api_key = "waffles"
|
28
|
+
mock_geocoder_result = OpenStruct.new( {:location => [51.000000, -1.000000]} )
|
29
|
+
Graticule.stub!(:service)
|
30
|
+
Graticule.service.should_receive(:new).and_return(mock_geocoder_result)
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should find one location" do
|
34
|
+
@page.locations.length.should == 1
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should have the right location details" do
|
38
|
+
location = @page.locations.first
|
39
|
+
location.latitude.should == 51.0
|
40
|
+
location.longitude.should == -1.0
|
41
|
+
location.title.should == "W1A 1AA"
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
describe Page, "with multiple microformats and postcodes being parsed" do
|
47
|
+
|
48
|
+
before(:each) do
|
49
|
+
OpenURI.should_receive(:open_uri).and_return(page_as_string('multiple_postcodes_and_microformats.html'))
|
50
|
+
@page = Page.new("http://www.example.com")
|
51
|
+
|
52
|
+
mock_geocoder_result = OpenStruct.new( {:location => [51.000000, -1.000000]} )
|
53
|
+
Graticule.stub!(:service)
|
54
|
+
Graticule.service.should_receive(:new).twice.and_return(mock_geocoder_result)
|
55
|
+
end
|
56
|
+
|
57
|
+
it "should find four locations" do
|
58
|
+
@page.locations.length.should == 4
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
62
|
+
|
63
|
+
describe Page, "which is not part of a site" do
|
64
|
+
|
65
|
+
before(:each) do
|
66
|
+
OpenURI.should_receive(:open_uri).and_return(page_as_string('page_with_links.html'))
|
67
|
+
@page = Page.new("http://www.waffles.com")
|
68
|
+
end
|
69
|
+
|
70
|
+
it "should raise if you try and get the internal_links" do
|
71
|
+
lambda { @page.internal_links }.should raise_error
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
76
|
+
describe Page, "which is part of a site" do
|
77
|
+
|
78
|
+
before(:each) do
|
79
|
+
OpenURI.should_receive(:open_uri).and_return(page_as_string('page_with_links.html'))
|
80
|
+
@site = Site.new("http://www.example.com/")
|
81
|
+
@page = Page.new("http://www.example.com/waffles", :site => @site)
|
82
|
+
end
|
83
|
+
|
84
|
+
it "should be able to extract them all" do
|
85
|
+
@page.links.length.should == 2
|
86
|
+
end
|
87
|
+
|
88
|
+
it "should be able to extract just the internal links" do
|
89
|
+
@page.internal_links.length.should == 1
|
90
|
+
@page.internal_links.reject { |l| l =~ /^http:\/\/www.example.com\// }.length.should == 0
|
91
|
+
end
|
92
|
+
|
93
|
+
it "should exclude the media links" do
|
94
|
+
@page.links.should_not include("http://www.example.com/download.mp3")
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
describe Page, "which is finding the title" do
|
99
|
+
|
100
|
+
before(:each) do
|
101
|
+
OpenURI.should_receive(:open_uri).and_return(page_as_string('page_with_links.html'))
|
102
|
+
end
|
103
|
+
|
104
|
+
describe "using the default" do
|
105
|
+
|
106
|
+
before(:each) do
|
107
|
+
@page = Page.new("http://www.example.com")
|
108
|
+
end
|
109
|
+
|
110
|
+
it "should find the title from the head" do
|
111
|
+
@page.title.should == "Page with Links"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
describe "specifying a h1 css selector" do
|
116
|
+
|
117
|
+
before(:each) do
|
118
|
+
@page = Page.new("http://www.example.com", :title_css_selector => "h1")
|
119
|
+
end
|
120
|
+
|
121
|
+
it "should find the title from the h1 tag" do
|
122
|
+
@page.title.should == "Heading 1"
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
begin
|
2
|
+
require 'spec'
|
3
|
+
rescue LoadError
|
4
|
+
require 'rubygems'
|
5
|
+
gem 'rspec'
|
6
|
+
require 'spec'
|
7
|
+
end
|
8
|
+
|
9
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
10
|
+
require 'geo-spider'
|
11
|
+
require 'ostruct'
|
12
|
+
include GeoSpider
|
13
|
+
|
14
|
+
# Set up the api key for testing so it doesn't raise
|
15
|
+
GeoSpider::Extractors::Postcode.api_key = "waffles"
|
16
|
+
|
17
|
+
def page_as_string(page_path)
|
18
|
+
IO.read(File.join(File.dirname(__FILE__), "assets", "pages", page_path))
|
19
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
desc 'Release the website and new gem version'
|
2
|
+
task :deploy => [:check_version, :website, :release] do
|
3
|
+
puts "Remember to create SVN tag:"
|
4
|
+
puts "svn copy svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/trunk " +
|
5
|
+
"svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/tags/REL-#{VERS} "
|
6
|
+
puts "Suggested comment:"
|
7
|
+
puts "Tagging release #{CHANGES}"
|
8
|
+
end
|
9
|
+
|
10
|
+
desc 'Runs tasks website_generate and install_gem as a local deployment of the gem'
|
11
|
+
task :local_deploy => [:website_generate, :install_gem]
|
12
|
+
|
13
|
+
task :check_version do
|
14
|
+
unless ENV['VERSION']
|
15
|
+
puts 'Must pass a VERSION=x.y.z release version'
|
16
|
+
exit
|
17
|
+
end
|
18
|
+
unless ENV['VERSION'] == VERS
|
19
|
+
puts "Please update your version.rb to match the release version, currently #{VERS}"
|
20
|
+
exit
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
desc 'Install the package as a gem, without generating documentation(ri/rdoc)'
|
25
|
+
task :install_gem_no_doc => [:clean, :package] do
|
26
|
+
sh "#{'sudo ' unless Hoe::WINDOZE }gem install pkg/*.gem --no-rdoc --no-ri"
|
27
|
+
end
|
28
|
+
|
29
|
+
namespace :manifest do
|
30
|
+
desc 'Recreate Manifest.txt to include ALL files'
|
31
|
+
task :refresh do
|
32
|
+
`rake check_manifest | patch -p0 > Manifest.txt`
|
33
|
+
end
|
34
|
+
end
|
data/tasks/rspec.rake
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
begin
|
2
|
+
require 'spec'
|
3
|
+
rescue LoadError
|
4
|
+
require 'rubygems'
|
5
|
+
require 'spec'
|
6
|
+
end
|
7
|
+
begin
|
8
|
+
require 'spec/rake/spectask'
|
9
|
+
rescue LoadError
|
10
|
+
puts <<-EOS
|
11
|
+
To use rspec for testing you must install rspec gem:
|
12
|
+
gem install rspec
|
13
|
+
EOS
|
14
|
+
exit(0)
|
15
|
+
end
|
16
|
+
|
17
|
+
desc "Run the specs under spec/models"
|
18
|
+
Spec::Rake::SpecTask.new do |t|
|
19
|
+
t.spec_opts = ['--options', "spec/spec.opts"]
|
20
|
+
t.spec_files = FileList['spec/**/*_spec.rb']
|
21
|
+
end
|
data/tasks/website.rake
ADDED
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: geo-spider
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tom Taylor
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-09-29 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.7.0
|
24
|
+
version:
|
25
|
+
description: Tool for spidering websites, extracting pages with geodata.
|
26
|
+
email:
|
27
|
+
- tom@tomtaylor.co.uk
|
28
|
+
executables: []
|
29
|
+
|
30
|
+
extensions: []
|
31
|
+
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- License.txt
|
35
|
+
- Manifest.txt
|
36
|
+
- PostInstall.txt
|
37
|
+
- README.txt
|
38
|
+
files:
|
39
|
+
- History.txt
|
40
|
+
- License.txt
|
41
|
+
- Manifest.txt
|
42
|
+
- PostInstall.txt
|
43
|
+
- README.txt
|
44
|
+
- Rakefile
|
45
|
+
- config/hoe.rb
|
46
|
+
- config/requirements.rb
|
47
|
+
- lib/geo-spider.rb
|
48
|
+
- lib/geo-spider/extractors/base.rb
|
49
|
+
- lib/geo-spider/extractors/master.rb
|
50
|
+
- lib/geo-spider/extractors/microformat.rb
|
51
|
+
- lib/geo-spider/extractors/postcode.rb
|
52
|
+
- lib/geo-spider/location.rb
|
53
|
+
- lib/geo-spider/page.rb
|
54
|
+
- lib/geo-spider/site.rb
|
55
|
+
- lib/geo-spider/version.rb
|
56
|
+
- script/console
|
57
|
+
- script/destroy
|
58
|
+
- script/generate
|
59
|
+
- setup.rb
|
60
|
+
- spec/assets/pages/multiple_postcodes_and_microformats.html
|
61
|
+
- spec/assets/pages/page_with_links.html
|
62
|
+
- spec/assets/pages/separate_microformat_and_postcode.html
|
63
|
+
- spec/assets/pages/single_microformat.html
|
64
|
+
- spec/assets/pages/single_postcode.html
|
65
|
+
- spec/geo-spider/page_spec.rb
|
66
|
+
- spec/geo-spider/site_spec.rb
|
67
|
+
- spec/spec.opts
|
68
|
+
- spec/spec_helper.rb
|
69
|
+
- tasks/deployment.rake
|
70
|
+
- tasks/environment.rake
|
71
|
+
- tasks/rspec.rake
|
72
|
+
- tasks/website.rake
|
73
|
+
has_rdoc: true
|
74
|
+
homepage: http://geospider.rubyforge.org
|
75
|
+
post_install_message: ""
|
76
|
+
rdoc_options:
|
77
|
+
- --main
|
78
|
+
- README.txt
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: "0"
|
86
|
+
version:
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: "0"
|
92
|
+
version:
|
93
|
+
requirements: []
|
94
|
+
|
95
|
+
rubyforge_project: geospider
|
96
|
+
rubygems_version: 1.2.0
|
97
|
+
signing_key:
|
98
|
+
specification_version: 2
|
99
|
+
summary: Tool for spidering websites, extracting pages with geodata.
|
100
|
+
test_files: []
|
101
|
+
|