geo-spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Multiple Microformats and Postcodes</title>
7
+ </head>
8
+ <body>
9
+ <p>
10
+ BBC Broadcasting House is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in Southwark.
11
+ </p>
12
+
13
+ <p>Headshift are at SE1 2NQ, but our favourite pub is the <abbr class="geo" title="51.503587;-0.075939">Anchor Tap</abbr>.</p>
14
+ </body>
15
+ </html>
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Page with Links</title>
7
+ </head>
8
+ <body>
9
+ <h1>Heading 1</h1>
10
+ <p>
11
+ <a href="http://www.example.com/broadcastinghouse">BBC Broadcasting House</a> is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in <a href="http://www.external.com/southwark">Southwark</a>. How about an <a href="http://www.example.com/download.mp3">MP3</a>?
12
+ </p>
13
+ </body>
14
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Separate Microformat and Postcode</title>
7
+ </head>
8
+ <body>
9
+ <p>
10
+ BBC Broadcasting House is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in Southwark.
11
+ </p>
12
+ </body>
13
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Single Microformat</title>
7
+ </head>
8
+ <body>
9
+ <p>
10
+ <abbr class="geo" title="51.517570;-0.138770">BBC Broadcasting House</abbr>
11
+ </p>
12
+ </body>
13
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Single Postcode</title>
7
+ </head>
8
+ <body>
9
+ <p>
10
+ BBC Broadcasting House is at W1A 1AA.
11
+ </p>
12
+ </body>
13
+ </html>
@@ -0,0 +1,125 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# A page whose only geodata is one geo microformat should produce exactly one
# location, with the coordinates and title asserted below.
describe Page, "with a single microformat which is being parsed" do

  before(:each) do
    fixture_html = page_as_string('single_microformat.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.example.com")
  end

  it "should find one location" do
    @page.locations.length.should == 1
  end

  it "should have the right location details" do
    found = @page.locations.first
    found.latitude.should == 51.51757
    found.longitude.should == -0.13877
    found.title.should == "BBC Broadcasting House"
  end
end
21
+
22
# A page whose only geodata is one postcode. The geocoder is replaced with a
# stub that always answers (51.0, -1.0), so no real geocoding service is hit.
describe Page, "with a single postcode which is being parsed" do

  before(:each) do
    fixture_html = page_as_string('single_postcode.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.example.com")
    GeoSpider::Extractors::Postcode.api_key = "waffles"

    # Stand-in for the geocoder result: only #location is provided.
    geocoded = OpenStruct.new(:location => [51.0, -1.0])
    Graticule.stub!(:service)
    Graticule.service.should_receive(:new).and_return(geocoded)
  end

  it "should find one location" do
    @page.locations.length.should == 1
  end

  it "should have the right location details" do
    found = @page.locations.first
    found.latitude.should == 51.0
    found.longitude.should == -1.0
    found.title.should == "W1A 1AA"
  end

end
45
+
46
# A page mixing microformats and postcodes. The geocoder stub is expected to
# be instantiated twice, and four locations are expected in total.
describe Page, "with multiple microformats and postcodes being parsed" do

  before(:each) do
    fixture_html = page_as_string('multiple_postcodes_and_microformats.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.example.com")

    geocoded = OpenStruct.new(:location => [51.0, -1.0])
    Graticule.stub!(:service)
    Graticule.service.should_receive(:new).twice.and_return(geocoded)
  end

  it "should find four locations" do
    @page.locations.length.should == 4
  end

end
62
+
63
# A page constructed without a :site option: asking it for internal links is
# expected to raise.
describe Page, "which is not part of a site" do

  before(:each) do
    fixture_html = page_as_string('page_with_links.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.waffles.com")
  end

  it "should raise if you try and get the internal_links" do
    asking_for_internal_links = lambda { @page.internal_links }
    asking_for_internal_links.should raise_error
  end

end
75
+
76
# A page that belongs to a Site: checks link extraction as a whole, the
# internal-link subset, and that the .mp3 link is left out.
describe Page, "which is part of a site" do

  before(:each) do
    OpenURI.should_receive(:open_uri).and_return(page_as_string('page_with_links.html'))
    @site = Site.new("http://www.example.com/")
    @page = Page.new("http://www.example.com/waffles", :site => @site)
  end

  it "should be able to extract them all" do
    @page.links.length.should == 2
  end

  it "should be able to extract just the internal links" do
    @page.internal_links.length.should == 1
    # Dots are escaped and the match is anchored to the start of the string,
    # so only links that really begin with the site's host are accepted.
    # (The unescaped form also matched hosts such as "wwwXexample.com".)
    internal_prefix = %r{\Ahttp://www\.example\.com/}
    @page.internal_links.reject { |l| l =~ internal_prefix }.length.should == 0
  end

  it "should exclude the media links" do
    @page.links.should_not include("http://www.example.com/download.mp3")
  end
end
97
+
98
# Title lookup: by default the title asserted is the one in the document head;
# with :title_css_selector the text of the selected element is expected.
describe Page, "which is finding the title" do

  before(:each) do
    fixture_html = page_as_string('page_with_links.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
  end

  describe "using the default" do

    before(:each) do
      @page = Page.new("http://www.example.com")
    end

    it "should find the title from the head" do
      @page.title.should == "Page with Links"
    end
  end

  describe "specifying a h1 css selector" do

    before(:each) do
      @page = Page.new("http://www.example.com", :title_css_selector => "h1")
    end

    it "should find the title from the h1 tag" do
      @page.title.should == "Heading 1"
    end
  end
end
@@ -0,0 +1,8 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# Smoke test: a Site can be constructed from a URL string.
describe Site, "which is being initialized" do

  it "should work" do
    site = Site.new("http://www.example.com")
    site.should be_kind_of(Site)
  end
end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --colour
@@ -0,0 +1,19 @@
1
# Load RSpec directly when it is already loadable; otherwise activate the
# rspec gem through RubyGems and try again.
begin
  require 'spec'
rescue LoadError
  require 'rubygems'
  gem 'rspec'
  require 'spec'
end

# Put the project's lib directory at the front of the load path so the specs
# run against the source tree.
lib_dir = File.dirname(__FILE__) + '/../lib'
$LOAD_PATH.unshift(lib_dir)
require 'geo-spider'
require 'ostruct'
include GeoSpider

# Set up the api key for testing so it doesn't raise
GeoSpider::Extractors::Postcode.api_key = "waffles"
16
+
17
# Returns the contents of a fixture page as a String.
#
# page_path - file name (or relative path) of the fixture, resolved against
#             the "assets/pages" directory that sits next to this file.
#
# File.read is used rather than IO.read so the argument is always treated as
# a plain file path (IO.read gives a leading "|" special meaning).
# Raises Errno::ENOENT if the fixture does not exist.
def page_as_string(page_path)
  File.read(File.join(File.dirname(__FILE__), "assets", "pages", page_path))
end
@@ -0,0 +1,34 @@
1
# Runs the version check, website and release tasks, then prints the svn
# command and comment to use for tagging the release by hand.
desc 'Release the website and new gem version'
task :deploy => [:check_version, :website, :release] do
  trunk_url = "svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/trunk"
  tag_url   = "svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/tags/REL-#{VERS}"

  puts "Remember to create SVN tag:"
  puts "svn copy #{trunk_url} #{tag_url} "
  puts "Suggested comment:"
  puts "Tagging release #{CHANGES}"
end
9
+
10
# Aggregate task with no body of its own: it only chains its prerequisites.
desc 'Runs tasks website_generate and install_gem as a local deployment of the gem'
task :local_deploy => [
  :website_generate,
  :install_gem
]
12
+
13
# Release guard: VERSION=x.y.z must be given in the environment and must equal
# the VERS constant. On either failure the message is printed and rake stops
# with a non-zero exit status, so a shell script or CI job chaining on the
# release can tell that it did not happen (a bare `exit` reports success).
task :check_version do
  requested = ENV['VERSION']
  unless requested
    puts 'Must pass a VERSION=x.y.z release version'
    exit 1
  end
  unless requested == VERS
    puts "Please update your version.rb to match the release version, currently #{VERS}"
    exit 1
  end
end
23
+
24
# Builds the package and installs the resulting gem without rdoc/ri output.
# The command is prefixed with sudo unless Hoe::WINDOZE is set.
desc 'Install the package as a gem, without generating documentation(ri/rdoc)'
task :install_gem_no_doc => [:clean, :package] do
  sudo_prefix = Hoe::WINDOZE ? '' : 'sudo '
  sh "#{sudo_prefix}gem install pkg/*.gem --no-rdoc --no-ri"
end
28
+
29
namespace :manifest do
  # Shells out to `rake check_manifest`, pipes its output through patch and
  # redirects the result into Manifest.txt.
  desc 'Recreate Manifest.txt to include ALL files'
  task :refresh do
    %x{rake check_manifest | patch -p0 > Manifest.txt}
  end
end
@@ -0,0 +1,7 @@
1
# Defines RUBY_APP once: "jruby" when RUBY_PLATFORM mentions java, otherwise
# "ruby". An already-defined RUBY_APP is left untouched.
task :ruby_env do
  unless defined? RUBY_APP
    RUBY_APP = (RUBY_PLATFORM =~ /java/) ? "jruby" : "ruby"
  end
end
data/tasks/rspec.rake ADDED
@@ -0,0 +1,21 @@
1
# Load RSpec and its rake task. Any LoadError -- from 'spec' itself (directly
# or via RubyGems) or from the spectask file -- ends up in the outer rescue,
# so a machine without rspec gets the install hint instead of a raw LoadError
# from the retry inside the inner rescue.
begin
  begin
    require 'spec'
  rescue LoadError
    require 'rubygems'
    require 'spec'
  end
  require 'spec/rake/spectask'
rescue LoadError
  puts <<-EOS
To use rspec for testing you must install rspec gem:
gem install rspec
EOS
  exit(0)
end

# The glob below picks up every *_spec.rb under spec/, so the description
# says so rather than naming a single subdirectory.
desc "Run all the specs under spec"
Spec::Rake::SpecTask.new do |t|
  t.spec_opts = ['--options', "spec/spec.opts"]
  t.spec_files = FileList['spec/**/*_spec.rb']
end
@@ -0,0 +1,9 @@
1
+ # stubs for the website generation
2
+ # To install the website framework:
3
+ # script/generate website
4
+
5
# Empty placeholder tasks: they have no actions, they only make the task
# names resolvable for tasks that list them as prerequisites.
task(:website_generate)

task(:website_upload)

# :website has no action of its own; it only depends on :publish_docs.
task(:website => :publish_docs)
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: geo-spider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Tom Taylor
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-09-29 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.7.0
24
+ version:
25
+ description: Tool for spidering websites, extracting pages with geodata.
26
+ email:
27
+ - tom@tomtaylor.co.uk
28
+ executables: []
29
+
30
+ extensions: []
31
+
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - License.txt
35
+ - Manifest.txt
36
+ - PostInstall.txt
37
+ - README.txt
38
+ files:
39
+ - History.txt
40
+ - License.txt
41
+ - Manifest.txt
42
+ - PostInstall.txt
43
+ - README.txt
44
+ - Rakefile
45
+ - config/hoe.rb
46
+ - config/requirements.rb
47
+ - lib/geo-spider.rb
48
+ - lib/geo-spider/extractors/base.rb
49
+ - lib/geo-spider/extractors/master.rb
50
+ - lib/geo-spider/extractors/microformat.rb
51
+ - lib/geo-spider/extractors/postcode.rb
52
+ - lib/geo-spider/location.rb
53
+ - lib/geo-spider/page.rb
54
+ - lib/geo-spider/site.rb
55
+ - lib/geo-spider/version.rb
56
+ - script/console
57
+ - script/destroy
58
+ - script/generate
59
+ - setup.rb
60
+ - spec/assets/pages/multiple_postcodes_and_microformats.html
61
+ - spec/assets/pages/page_with_links.html
62
+ - spec/assets/pages/separate_microformat_and_postcode.html
63
+ - spec/assets/pages/single_microformat.html
64
+ - spec/assets/pages/single_postcode.html
65
+ - spec/geo-spider/page_spec.rb
66
+ - spec/geo-spider/site_spec.rb
67
+ - spec/spec.opts
68
+ - spec/spec_helper.rb
69
+ - tasks/deployment.rake
70
+ - tasks/environment.rake
71
+ - tasks/rspec.rake
72
+ - tasks/website.rake
73
+ has_rdoc: true
74
+ homepage: http://geospider.rubyforge.org
75
+ post_install_message: ""
76
+ rdoc_options:
77
+ - --main
78
+ - README.txt
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: "0"
92
+ version:
93
+ requirements: []
94
+
95
+ rubyforge_project: geospider
96
+ rubygems_version: 1.2.0
97
+ signing_key:
98
+ specification_version: 2
99
+ summary: Tool for spidering websites, extracting pages with geodata.
100
+ test_files: []
101
+