geo-spider 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Multiple Microformats and Postcodes</title>
7
+ </head>
8
+ <body>
9
+ <p>
10
+ BBC Broadcasting House is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in Southwark.
11
+ </p>
12
+
13
+ <p>Headshift are at SE1 2NQ, but our favourite pub is the <abbr class="geo" title="51.503587;-0.075939">Anchor Tap</abbr>.</p>
14
+ </body>
15
+ </html>
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Page with Links</title>
7
+ </head>
8
+ <body>
9
+ <h1>Heading 1</h1>
10
+ <p>
11
+ <a href="http://www.example.com/broadcastinghouse">BBC Broadcasting House</a> is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in <a href="http://www.external.com/southwark">Southwark</a>. How about an <a href="http://www.example.com/download.mp3">MP3</a>?
12
+ </p>
13
+ </body>
14
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Separate Microformat and Postcode</title>
7
+ </head>
8
+ <body>
9
+ <p>
10
+ BBC Broadcasting House is at W1A 1AA. <abbr class="geo" title="51.503571;-0.074500">Lafone Street</abbr> is in Southwark.
11
+ </p>
12
+ </body>
13
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Single Microformat</title>
7
+ </head>
8
+ <body>
9
+ <p>
10
+ <abbr class="geo" title="51.517570;-0.138770">BBC Broadcasting House</abbr>
11
+ </p>
12
+ </body>
13
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html>
4
+ <head>
5
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8">
6
+ <title>Single Postcode</title>
7
+ </head>
8
+ <body>
9
+ <p>
10
+ BBC Broadcasting House is at W1A 1AA.
11
+ </p>
12
+ </body>
13
+ </html>
@@ -0,0 +1,125 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# Specs for a page whose fixture contains exactly one geo microformat.
describe Page, "with a single microformat which is being parsed" do

  # Expect OpenURI.open_uri to be called and make it hand back the
  # fixture HTML, then build the page under test.
  before(:each) do
    fixture_html = page_as_string('single_microformat.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.example.com")
  end

  it "should find one location" do
    found = @page.locations
    found.length.should == 1
  end

  it "should have the right location details" do
    # Expected values mirror the abbr title and text in the fixture.
    first_location = @page.locations.first
    first_location.latitude.should == 51.51757
    first_location.longitude.should == -0.13877
    first_location.title.should == "BBC Broadcasting House"
  end
end
21
+
22
# Specs for a page whose fixture contains exactly one postcode.
describe Page, "with a single postcode which is being parsed" do

  before(:each) do
    fixture_html = page_as_string('single_postcode.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.example.com")

    # Placeholder API key for the postcode extractor.
    GeoSpider::Extractors::Postcode.api_key = "waffles"

    # Canned geocoder result: Graticule.service is stubbed and the object
    # it returns is expected to receive :new exactly once.
    canned_result = OpenStruct.new(:location => [51.0, -1.0])
    Graticule.stub!(:service)
    Graticule.service.should_receive(:new).and_return(canned_result)
  end

  it "should find one location" do
    found = @page.locations
    found.length.should == 1
  end

  it "should have the right location details" do
    # Coordinates come from the canned geocoder result above.
    first_location = @page.locations.first
    first_location.latitude.should == 51.0
    first_location.longitude.should == -1.0
    first_location.title.should == "W1A 1AA"
  end

end
45
+
46
# Specs for a page whose fixture mixes two postcodes with two geo
# microformats.
describe Page, "with multiple microformats and postcodes being parsed" do

  before(:each) do
    fixture_html = page_as_string('multiple_postcodes_and_microformats.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.example.com")

    # The stubbed geocoder is expected to be instantiated twice.
    canned_result = OpenStruct.new(:location => [51.0, -1.0])
    Graticule.stub!(:service)
    Graticule.service.should_receive(:new).twice.and_return(canned_result)
  end

  it "should find four locations" do
    found = @page.locations
    found.length.should == 4
  end

end
62
+
63
# Specs for a page constructed without a :site option.
describe Page, "which is not part of a site" do

  before(:each) do
    fixture_html = page_as_string('page_with_links.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @page = Page.new("http://www.waffles.com")
  end

  it "should raise if you try and get the internal_links" do
    asking_for_internal_links = lambda { @page.internal_links }
    asking_for_internal_links.should raise_error
  end

end
75
+
76
# Specs for a page that belongs to a Site. The fixture contains three
# anchors: two on www.example.com (one of them an .mp3 download) and one
# on www.external.com.
describe Page, "which is part of a site" do

  before(:each) do
    fixture_html = page_as_string('page_with_links.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
    @site = Site.new("http://www.example.com/")
    @page = Page.new("http://www.example.com/waffles", :site => @site)
  end

  it "should be able to extract them all" do
    @page.links.length.should == 2
  end

  it "should be able to extract just the internal links" do
    @page.internal_links.length.should == 1
    # Dots are escaped and \A anchors the start of the whole string, so
    # only URLs that really begin with the site root are accepted.
    site_root = %r{\Ahttp://www\.example\.com/}
    @page.internal_links.reject { |link| link =~ site_root }.length.should == 0
  end

  it "should exclude the media links" do
    @page.links.should_not include("http://www.example.com/download.mp3")
  end
end
97
+
98
# Specs for how a page determines its title, using the fixture that has
# both a <title> element and an <h1>.
describe Page, "which is finding the title" do

  before(:each) do
    fixture_html = page_as_string('page_with_links.html')
    OpenURI.should_receive(:open_uri).and_return(fixture_html)
  end

  describe "using the default" do

    before(:each) do
      @page = Page.new("http://www.example.com")
    end

    it "should find the title from the head" do
      title = @page.title
      title.should == "Page with Links"
    end
  end

  describe "specifying a h1 css selector" do

    before(:each) do
      @page = Page.new("http://www.example.com", :title_css_selector => "h1")
    end

    it "should find the title from the h1 tag" do
      title = @page.title
      title.should == "Heading 1"
    end
  end
end
@@ -0,0 +1,8 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# Smoke test: constructing a Site from a URL yields a Site.
describe Site, "which is being initialized" do

  it "should work" do
    site = Site.new("http://www.example.com")
    site.should be_kind_of(Site)
  end
end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --colour
@@ -0,0 +1,19 @@
1
+ begin
2
+ require 'spec'
3
+ rescue LoadError
4
+ require 'rubygems'
5
+ gem 'rspec'
6
+ require 'spec'
7
+ end
8
+
9
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
10
+ require 'geo-spider'
11
+ require 'ostruct'
12
+ include GeoSpider
13
+
14
+ # Set a placeholder API key up front so the Postcode extractor doesn't raise during the specs
15
+ GeoSpider::Extractors::Postcode.api_key = "waffles"
16
+
17
# Returns the contents of a fixture page as a String.
#
# page_path - file name relative to the "assets/pages" directory that sits
#             next to this helper file.
def page_as_string(page_path)
  fixtures_dir = File.join(File.dirname(__FILE__), "assets", "pages")
  IO.read(File.join(fixtures_dir, page_path))
end
@@ -0,0 +1,34 @@
1
desc 'Release the website and new gem version'
task :deploy => [:check_version, :website, :release] do
  # Print a ready-to-paste svn command for tagging the release.
  svn_root = "svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}"
  puts "Remember to create SVN tag:"
  puts "svn copy #{svn_root}/trunk #{svn_root}/tags/REL-#{VERS} "
  puts "Suggested comment:"
  puts "Tagging release #{CHANGES}"
end

desc 'Runs tasks website_generate and install_gem as a local deployment of the gem'
task(:local_deploy => [:website_generate, :install_gem])
12
+
13
# Release guard: a VERSION environment variable must be supplied and must
# equal the VERS constant. On failure the message is printed and rake exits
# with a non-zero status so that callers and scripts can detect the error.
task :check_version do
  requested_version = ENV['VERSION']
  unless requested_version
    puts 'Must pass a VERSION=x.y.z release version'
    exit 1
  end
  unless requested_version == VERS
    puts "Please update your version.rb to match the release version, currently #{VERS}"
    exit 1
  end
end
23
+
24
desc 'Install the package as a gem, without generating documentation(ri/rdoc)'
task :install_gem_no_doc => [:clean, :package] do
  # sudo is prepended unless Hoe reports a Windows platform.
  sudo_prefix = Hoe::WINDOZE ? '' : 'sudo '
  sh "#{sudo_prefix}gem install pkg/*.gem --no-rdoc --no-ri"
end
28
+
29
namespace :manifest do
  desc 'Recreate Manifest.txt to include ALL files'
  task :refresh do
    # Pipe the output of check_manifest through patch and write the
    # result to Manifest.txt.
    %x(rake check_manifest | patch -p0 > Manifest.txt)
  end
end
@@ -0,0 +1,7 @@
1
# Defines RUBY_APP ("jruby" when RUBY_PLATFORM mentions java, "ruby"
# otherwise) unless the constant already exists.
task :ruby_env do
  unless defined? RUBY_APP
    RUBY_APP = RUBY_PLATFORM =~ /java/ ? "jruby" : "ruby"
  end
end
data/tasks/rspec.rake ADDED
@@ -0,0 +1,21 @@
1
+ begin
2
+ require 'spec'
3
+ rescue LoadError
4
+ require 'rubygems'
5
+ require 'spec'
6
+ end
7
+ begin
8
+ require 'spec/rake/spectask'
9
+ rescue LoadError
10
+ puts <<-EOS
11
+ To use rspec for testing you must install rspec gem:
12
+ gem install rspec
13
+ EOS
14
+ exit(0)
15
+ end
16
+
17
# Spec task: runs every *_spec.rb file found anywhere below spec/, passing
# the options stored in spec/spec.opts.
desc "Run all the specs under spec/"
Spec::Rake::SpecTask.new do |t|
  t.spec_opts = ['--options', "spec/spec.opts"]
  t.spec_files = FileList['spec/**/*_spec.rb']
end
@@ -0,0 +1,9 @@
1
# Placeholder tasks for website generation.
# To install the website framework:
#   script/generate website

task(:website_generate)

task(:website_upload)

task(:website => :publish_docs)
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: geo-spider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Tom Taylor
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-09-29 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.7.0
24
+ version:
25
+ description: Tool for spidering websites, extracting pages with geodata.
26
+ email:
27
+ - tom@tomtaylor.co.uk
28
+ executables: []
29
+
30
+ extensions: []
31
+
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - License.txt
35
+ - Manifest.txt
36
+ - PostInstall.txt
37
+ - README.txt
38
+ files:
39
+ - History.txt
40
+ - License.txt
41
+ - Manifest.txt
42
+ - PostInstall.txt
43
+ - README.txt
44
+ - Rakefile
45
+ - config/hoe.rb
46
+ - config/requirements.rb
47
+ - lib/geo-spider.rb
48
+ - lib/geo-spider/extractors/base.rb
49
+ - lib/geo-spider/extractors/master.rb
50
+ - lib/geo-spider/extractors/microformat.rb
51
+ - lib/geo-spider/extractors/postcode.rb
52
+ - lib/geo-spider/location.rb
53
+ - lib/geo-spider/page.rb
54
+ - lib/geo-spider/site.rb
55
+ - lib/geo-spider/version.rb
56
+ - script/console
57
+ - script/destroy
58
+ - script/generate
59
+ - setup.rb
60
+ - spec/assets/pages/multiple_postcodes_and_microformats.html
61
+ - spec/assets/pages/page_with_links.html
62
+ - spec/assets/pages/separate_microformat_and_postcode.html
63
+ - spec/assets/pages/single_microformat.html
64
+ - spec/assets/pages/single_postcode.html
65
+ - spec/geo-spider/page_spec.rb
66
+ - spec/geo-spider/site_spec.rb
67
+ - spec/spec.opts
68
+ - spec/spec_helper.rb
69
+ - tasks/deployment.rake
70
+ - tasks/environment.rake
71
+ - tasks/rspec.rake
72
+ - tasks/website.rake
73
+ has_rdoc: true
74
+ homepage: http://geospider.rubyforge.org
75
+ post_install_message: ""
76
+ rdoc_options:
77
+ - --main
78
+ - README.txt
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: "0"
92
+ version:
93
+ requirements: []
94
+
95
+ rubyforge_project: geospider
96
+ rubygems_version: 1.2.0
97
+ signing_key:
98
+ specification_version: 2
99
+ summary: Tool for spidering websites, extracting pages with geodata.
100
+ test_files: []
101
+