spidr 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ require 'spidr/page'
2
+
3
+ require 'spec_helper'
4
+
5
+ shared_examples_for "Page" do
6
+ it "should have a status code" do
7
+ @page.code.should be_integer
8
+ end
9
+
10
+ it "should have a body" do
11
+ @page.body.should_not be_empty
12
+ end
13
+
14
+ it "should provide transparent access to the response headers" do
15
+ @page.content_type.should == @page.content_type
16
+ end
17
+ end
data/spec/page_spec.rb ADDED
@@ -0,0 +1,81 @@
1
+ require 'spidr/page'
2
+
3
+ require 'spec_helper'
4
+ require 'page_examples'
5
+ require 'helpers/page'
6
+
7
+ describe Page do
8
+ describe "html" do
9
+ before(:all) do
10
+ @page = get_page('http://spidr.rubyforge.org/course/start.html')
11
+ end
12
+
13
+ it_should_behave_like "Page"
14
+
15
+ it "should be OK" do
16
+ @page.should be_ok
17
+ end
18
+
19
+ it "should have a content-type" do
20
+ @page.content_type.should =~ /text\/html/
21
+ end
22
+
23
+ it "should be a html page" do
24
+ @page.should be_html
25
+ end
26
+
27
+ it "should have provide a document" do
28
+ @page.doc.class.should == Nokogiri::HTML::Document
29
+ end
30
+
31
+ it "should allow searching the document" do
32
+ @page.doc.search('//p').length.should == 2
33
+ @page.doc.at('//p[2]').inner_text.should == 'Ready! Set! Go!'
34
+ end
35
+
36
+ it "should have a title" do
37
+ @page.title.should == 'Spidr :: Web-Spider Obstacle Course :: Start'
38
+ end
39
+
40
+ it "should have links" do
41
+ @page.links.should_not be_empty
42
+ end
43
+ end
44
+
45
+ describe "txt" do
46
+ before(:all) do
47
+ @page = get_page('http://www.ruby-lang.org/en/LICENSE.txt')
48
+ end
49
+
50
+ it_should_behave_like "Page"
51
+
52
+ it "should be OK" do
53
+ @page.should be_ok
54
+ end
55
+
56
+ it "should have a content-type" do
57
+ @page.content_type.should =~ /text\/plain/
58
+ end
59
+
60
+ it "should be a txt page" do
61
+ @page.should be_txt
62
+ end
63
+
64
+ it "should not have provide a document" do
65
+ @page.doc.should be_nil
66
+ end
67
+
68
+ it "should not allow searching the document" do
69
+ @page.search('//p').should be_empty
70
+ @page.at('//p').should be_nil
71
+ end
72
+
73
+ it "should not have links" do
74
+ @page.links.should be_empty
75
+ end
76
+
77
+ it "should not have a title" do
78
+ @page.title.should be_nil
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,43 @@
1
+ require 'spidr/rules'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Rules do
6
+ it "should accept data based on acceptance data" do
7
+ rules = Rules.new(:accept => [1])
8
+
9
+ rules.accept?(1).should == true
10
+ end
11
+
12
+ it "should accept data based on acceptance regexps" do
13
+ rules = Rules.new(:accept => [/1/])
14
+
15
+ rules.accept?('1').should == true
16
+ end
17
+
18
+ it "should match non-Strings using acceptance regexps" do
19
+ rules = Rules.new(:accept => [/1/])
20
+
21
+ rules.accept?(1).should == true
22
+ end
23
+
24
+ it "should accept data using acceptance lambdas" do
25
+ rules = Rules.new(:accept => [lambda { |data| data > 2 }])
26
+
27
+ rules.accept?(3).should == true
28
+ end
29
+
30
+ it "should reject data that does not match any acceptance patterns" do
31
+ rules = Rules.new(:accept => [1, 2, 3])
32
+
33
+ rules.accept?(2).should == true
34
+ rules.accept?(4).should == false
35
+ end
36
+
37
+ it "should accept data that does not match any rejection patterns" do
38
+ rules = Rules.new(:reject => [1, 2, 3])
39
+
40
+ rules.accept?(2).should == false
41
+ rules.accept?(4).should == true
42
+ end
43
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'rubygems'
2
- gem 'rspec', '>=1.1.3'
2
+ gem 'rspec', '>=1.2.8'
3
3
  require 'spec'
4
4
 
5
5
  require 'spidr/version'
data/spec/spidr_spec.rb CHANGED
@@ -6,4 +6,34 @@ describe Spidr do
6
6
  it "should have a VERSION constant" do
7
7
  Spidr.const_defined?('VERSION').should == true
8
8
  end
9
+
10
+ describe "proxy" do
11
+ after(:all) do
12
+ Spidr.disable_proxy!
13
+ end
14
+
15
+ it "should not have proxy settings by default" do
16
+ Spidr.proxy[:host].should be_nil
17
+ end
18
+
19
+ it "should allow setting new proxy settings" do
20
+ Spidr.proxy = {:host => 'example.com', :port => 8010}
21
+
22
+ Spidr.proxy[:host].should == 'example.com'
23
+ Spidr.proxy[:port].should == 8010
24
+ end
25
+
26
+ it "should default the :port option of new proxy settings" do
27
+ Spidr.proxy = {:host => 'example.com'}
28
+
29
+ Spidr.proxy[:host].should == 'example.com'
30
+ Spidr.proxy[:port].should == Spidr::COMMON_PROXY_PORT
31
+ end
32
+
33
+ it "should allow disabling the proxy" do
34
+ Spidr.disable_proxy!
35
+
36
+ Spidr.proxy[:host].should be_nil
37
+ end
38
+ end
9
39
  end
@@ -1 +1 @@
1
- [{"link":"\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","behavior":"follow"},{"link":"\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","behavior":"nofollow"},{"link":"","example":"<a>should not follow links with no href attributes<\/a>","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":" ","example":"<a href=\"\">should ignore links with blank href attributes<\/a>","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","behavior":"ignore"},{"link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","behavior":"ignore"},{"link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","behavior":"ignore"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","message":"should not follow links to previously visited pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","behavior":"follow"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","behavior":"follow"},{"link":".\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","behavior":"follow"},{"link":"..\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","behavior":"follow"},{"link":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","behavior":"ignore"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","behavior":"fail"},{"link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes<\/a>","message":"should follow links within iframes","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/iframe_next.html","behavior":"follow"},{"link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames<\/a>","message":"should follow links within frames","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/frame_next.html","behavior":"follow"}]
1
+ [{"url":"http://spidr.rubyforge.org/course/absolute/next.html","link":"/course/absolute/next.html","example":"<a href=\"/course/absolute/next.html\">should follow absolute links to unvisited pages</a>","message":"should follow absolute links to unvisited pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/absolute/start.html","link":"/course/absolute/start.html","example":"<a href=\"/course/absolute/start.html\">should not follow absolute links to the current page</a>","message":"should not follow absolute links to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a>should not follow links with no href attributes</a>","message":"should not follow links with no href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a href=\"\">should not follow links with empty href attributes</a>","message":"should not follow links with empty href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/%20","link":" ","example":"<a href=\"\">should ignore links with blank href attributes</a>","message":"should ignore links with blank href attributes","behavior":"ignore"},{"url":"javascript:fail();","link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"</a>","message":"should ignore links beginning with \"javascript:\"","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/javascript/%23","link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.</a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages</a>","message":"should not follow links to previously visited pages","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/loop/next.html","link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages</a>","message":"should follow links pointing to other pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page</a>","message":"should not follow links pointing to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/relative/normal.html","link":"normal.html","example":"<a href=\"normal.html\">should follow relative links</a>","message":"should follow relative links","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/current_directory.html","link":"./current_directory.html","example":"<a href=\"./current_directory.html\">should follow relative links to files in the current directory</a>","message":"should follow relative links to files in the current directory","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/same_directory.html","link":"../relative/same_directory.html","example":"<a href=\"../relative/same_directory.html\">should follow links that transverse directories</a>","message":"should follow links that transverse directories","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/%23","link":"#","example":"<a href=\"#\">should ignore in-page links</a>","message":"should ignore in-page links","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/remote/next.html","link":"http://spidr.rubyforge.org/course/remote/next.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/next.html\">should follow remote links to unvisited pages</a>","message":"should follow remote links to unvisited pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/start.html\">should not follow remote links to the same page</a>","message":"should not follow remote links to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/loop/../remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/loop/../remote/start.html\">should not follow remote links with a relative path to the same page</a>","message":"should not follow remote links with a relative path to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org:1337/path/","link":"http://spidr.rubyforge.org:1337/path/","example":"<a href=\"http://spidr.rubyforge.org:1337/path/\">should ignore links that fail</a>","message":"should ignore links that fail","behavior":"fail"},{"url":"http://spidr.rubyforge.org/course/frames/iframe_next.html","link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes</a>","message":"should follow links within iframes","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/frames/frame_next.html","link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames</a>","message":"should follow links within frames","behavior":"follow"}]
data/tasks/course.rb CHANGED
@@ -1,3 +1,10 @@
1
+ lib_dir = File.expand_path(File.join(File.dirname(__FILE__),'..','lib'))
2
+ unless $LOAD_PATH.include?(lib_dir)
3
+ $LOAD_PATH.unshift(lib_dir)
4
+ end
5
+
6
+ require 'spidr/extensions/uri'
7
+
1
8
  require 'nokogiri'
2
9
  require 'json'
3
10
 
@@ -22,7 +29,7 @@ namespace :course do
22
29
  absolute_url = page_url.merge(URI.encode(relative_url))
23
30
 
24
31
  if absolute_url.path
25
- absolute_url.path = File.expand_path(absolute_url.path)
32
+ absolute_url.path = URI.expand_path(absolute_url.path)
26
33
  end
27
34
 
28
35
  spec_data.merge(
data/tasks/spec.rb CHANGED
@@ -6,4 +6,5 @@ Spec::Rake::SpecTask.new(:spec) do |t|
6
6
  t.spec_opts = ['--colour', '--format', 'specdoc']
7
7
  end
8
8
 
9
+ task :test => :spec
9
10
  task :default => :spec
data/tasks/yard.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'yard'
2
+
3
+ YARD::Rake::YardocTask.new do |t|
4
+ t.files = ['lib/**/*.rb']
5
+ t.options = [
6
+ '--protected',
7
+ '--files', 'History.txt',
8
+ '--title', 'Spidr'
9
+ ]
10
+ end
11
+
12
+ task :docs => :yardoc
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -30,7 +30,7 @@ cert_chain:
30
30
  pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2009-06-13 00:00:00 -07:00
33
+ date: 2009-10-10 00:00:00 -07:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -41,7 +41,27 @@ dependencies:
41
41
  requirements:
42
42
  - - ">="
43
43
  - !ruby/object:Gem::Version
44
- version: "0"
44
+ version: 1.2.0
45
+ version:
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ type: :development
49
+ version_requirement:
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.8
55
+ version:
56
+ - !ruby/object:Gem::Dependency
57
+ name: yard
58
+ type: :development
59
+ version_requirement:
60
+ version_requirements: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 0.2.3.5
45
65
  version:
46
66
  - !ruby/object:Gem::Dependency
47
67
  name: hoe
@@ -51,7 +71,7 @@ dependencies:
51
71
  requirements:
52
72
  - - ">="
53
73
  - !ruby/object:Gem::Version
54
- version: 2.0.0
74
+ version: 2.3.3
55
75
  version:
56
76
  description: |-
57
77
  Spidr is a versatile Ruby web spidering library that can spider a site,
@@ -73,15 +93,34 @@ files:
73
93
  - README.txt
74
94
  - Rakefile
75
95
  - lib/spidr.rb
96
+ - lib/spidr/extensions.rb
97
+ - lib/spidr/extensions/uri.rb
76
98
  - lib/spidr/page.rb
77
99
  - lib/spidr/rules.rb
100
+ - lib/spidr/filters.rb
101
+ - lib/spidr/events.rb
102
+ - lib/spidr/actions.rb
103
+ - lib/spidr/actions/exceptions.rb
104
+ - lib/spidr/actions/exceptions/action.rb
105
+ - lib/spidr/actions/exceptions/paused.rb
106
+ - lib/spidr/actions/exceptions/skip_link.rb
107
+ - lib/spidr/actions/exceptions/skip_page.rb
108
+ - lib/spidr/actions/actions.rb
78
109
  - lib/spidr/agent.rb
79
110
  - lib/spidr/spidr.rb
80
111
  - lib/spidr/version.rb
81
112
  - tasks/spec.rb
113
+ - tasks/yard.rb
82
114
  - tasks/course.rb
83
115
  - spec/spec_helper.rb
84
116
  - spec/helpers/course.rb
117
+ - spec/helpers/page.rb
118
+ - spec/extensions/uri_spec.rb
119
+ - spec/page_examples.rb
120
+ - spec/page_spec.rb
121
+ - spec/rules_spec.rb
122
+ - spec/filters_spec.rb
123
+ - spec/actions_spec.rb
85
124
  - spec/agent_spec.rb
86
125
  - spec/spidr_spec.rb
87
126
  - static/course/index.html
@@ -114,7 +153,7 @@ files:
114
153
  - static/course/frames/frame.html
115
154
  - static/course/frames/frame_next.html
116
155
  - static/course/specs.json
117
- has_rdoc: true
156
+ has_rdoc: yard
118
157
  homepage: http://spidr.rubyforge.org/
119
158
  licenses: []
120
159
 
@@ -139,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
139
178
  requirements: []
140
179
 
141
180
  rubyforge_project: spidr
142
- rubygems_version: 1.3.4
181
+ rubygems_version: 1.3.5
143
182
  signing_key:
144
183
  specification_version: 3
145
184
  summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
metadata.gz.sig CHANGED
Binary file