spidr 0.1.9 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ require 'spidr/page'
2
+
3
+ require 'spec_helper'
4
+
5
+ shared_examples_for "Page" do
6
+ it "should have a status code" do
7
+ @page.code.should be_integer
8
+ end
9
+
10
+ it "should have a body" do
11
+ @page.body.should_not be_empty
12
+ end
13
+
14
+ it "should provide transparent access to the response headers" do
15
+ @page.content_type.should == @page.content_type
16
+ end
17
+ end
data/spec/page_spec.rb ADDED
@@ -0,0 +1,81 @@
1
+ require 'spidr/page'
2
+
3
+ require 'spec_helper'
4
+ require 'page_examples'
5
+ require 'helpers/page'
6
+
7
+ describe Page do
8
+ describe "html" do
9
+ before(:all) do
10
+ @page = get_page('http://spidr.rubyforge.org/course/start.html')
11
+ end
12
+
13
+ it_should_behave_like "Page"
14
+
15
+ it "should be OK" do
16
+ @page.should be_ok
17
+ end
18
+
19
+ it "should have a content-type" do
20
+ @page.content_type.should =~ /text\/html/
21
+ end
22
+
23
+ it "should be a html page" do
24
+ @page.should be_html
25
+ end
26
+
27
+ it "should have provide a document" do
28
+ @page.doc.class.should == Nokogiri::HTML::Document
29
+ end
30
+
31
+ it "should allow searching the document" do
32
+ @page.doc.search('//p').length.should == 2
33
+ @page.doc.at('//p[2]').inner_text.should == 'Ready! Set! Go!'
34
+ end
35
+
36
+ it "should have a title" do
37
+ @page.title.should == 'Spidr :: Web-Spider Obstacle Course :: Start'
38
+ end
39
+
40
+ it "should have links" do
41
+ @page.links.should_not be_empty
42
+ end
43
+ end
44
+
45
+ describe "txt" do
46
+ before(:all) do
47
+ @page = get_page('http://www.ruby-lang.org/en/LICENSE.txt')
48
+ end
49
+
50
+ it_should_behave_like "Page"
51
+
52
+ it "should be OK" do
53
+ @page.should be_ok
54
+ end
55
+
56
+ it "should have a content-type" do
57
+ @page.content_type.should =~ /text\/plain/
58
+ end
59
+
60
+ it "should be a txt page" do
61
+ @page.should be_txt
62
+ end
63
+
64
+ it "should not have provide a document" do
65
+ @page.doc.should be_nil
66
+ end
67
+
68
+ it "should not allow searching the document" do
69
+ @page.search('//p').should be_empty
70
+ @page.at('//p').should be_nil
71
+ end
72
+
73
+ it "should not have links" do
74
+ @page.links.should be_empty
75
+ end
76
+
77
+ it "should not have a title" do
78
+ @page.title.should be_nil
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,43 @@
1
+ require 'spidr/rules'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Rules do
6
+ it "should accept data based on acceptance data" do
7
+ rules = Rules.new(:accept => [1])
8
+
9
+ rules.accept?(1).should == true
10
+ end
11
+
12
+ it "should accept data based on acceptance regexps" do
13
+ rules = Rules.new(:accept => [/1/])
14
+
15
+ rules.accept?('1').should == true
16
+ end
17
+
18
+ it "should match non-Strings using acceptance regexps" do
19
+ rules = Rules.new(:accept => [/1/])
20
+
21
+ rules.accept?(1).should == true
22
+ end
23
+
24
+ it "should accept data using acceptance lambdas" do
25
+ rules = Rules.new(:accept => [lambda { |data| data > 2 }])
26
+
27
+ rules.accept?(3).should == true
28
+ end
29
+
30
+ it "should reject data that does not match any acceptance patterns" do
31
+ rules = Rules.new(:accept => [1, 2, 3])
32
+
33
+ rules.accept?(2).should == true
34
+ rules.accept?(4).should == false
35
+ end
36
+
37
+ it "should accept data that does not match any rejection patterns" do
38
+ rules = Rules.new(:reject => [1, 2, 3])
39
+
40
+ rules.accept?(2).should == false
41
+ rules.accept?(4).should == true
42
+ end
43
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'rubygems'
2
- gem 'rspec', '>=1.1.3'
2
+ gem 'rspec', '>=1.2.8'
3
3
  require 'spec'
4
4
 
5
5
  require 'spidr/version'
data/spec/spidr_spec.rb CHANGED
@@ -6,4 +6,34 @@ describe Spidr do
6
6
  it "should have a VERSION constant" do
7
7
  Spidr.const_defined?('VERSION').should == true
8
8
  end
9
+
10
+ describe "proxy" do
11
+ after(:all) do
12
+ Spidr.disable_proxy!
13
+ end
14
+
15
+ it "should not have proxy settings by default" do
16
+ Spidr.proxy[:host].should be_nil
17
+ end
18
+
19
+ it "should allow setting new proxy settings" do
20
+ Spidr.proxy = {:host => 'example.com', :port => 8010}
21
+
22
+ Spidr.proxy[:host].should == 'example.com'
23
+ Spidr.proxy[:port].should == 8010
24
+ end
25
+
26
+ it "should default the :port option of new proxy settings" do
27
+ Spidr.proxy = {:host => 'example.com'}
28
+
29
+ Spidr.proxy[:host].should == 'example.com'
30
+ Spidr.proxy[:port].should == Spidr::COMMON_PROXY_PORT
31
+ end
32
+
33
+ it "should allow disabling the proxy" do
34
+ Spidr.disable_proxy!
35
+
36
+ Spidr.proxy[:host].should be_nil
37
+ end
38
+ end
9
39
  end
@@ -1 +1 @@
1
- [{"link":"\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","behavior":"follow"},{"link":"\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","behavior":"nofollow"},{"link":"","example":"<a>should not follow links with no href attributes<\/a>","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":" ","example":"<a href=\"\">should ignore links with blank href attributes<\/a>","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","behavior":"ignore"},{"link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","behavior":"ignore"},{"link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","behavior":"ignore"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","message":"should not follow links to previously visited pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","behavior":"follow"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","behavior":"follow"},{"link":".\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","behavior":"follow"},{"link":"..\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","behavior":"follow"},{"link":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","behavior":"ignore"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","behavior":"fail"},{"link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes<\/a>","message":"should follow links within iframes","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/iframe_next.html","behavior":"follow"},{"link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames<\/a>","message":"should follow links within frames","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/frame_next.html","behavior":"follow"}]
1
+ [{"url":"http://spidr.rubyforge.org/course/absolute/next.html","link":"/course/absolute/next.html","example":"<a href=\"/course/absolute/next.html\">should follow absolute links to unvisited pages</a>","message":"should follow absolute links to unvisited pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/absolute/start.html","link":"/course/absolute/start.html","example":"<a href=\"/course/absolute/start.html\">should not follow absolute links to the current page</a>","message":"should not follow absolute links to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a>should not follow links with no href attributes</a>","message":"should not follow links with no href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a href=\"\">should not follow links with empty href attributes</a>","message":"should not follow links with empty href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/%20","link":" ","example":"<a href=\"\">should ignore links with blank href attributes</a>","message":"should ignore links with blank href attributes","behavior":"ignore"},{"url":"javascript:fail();","link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"</a>","message":"should ignore links beginning with \"javascript:\"","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/javascript/%23","link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.</a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages</a>","message":"should not follow links to previously visited pages","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/loop/next.html","link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages</a>","message":"should follow links pointing to other pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page</a>","message":"should not follow links pointing to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/relative/normal.html","link":"normal.html","example":"<a href=\"normal.html\">should follow relative links</a>","message":"should follow relative links","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/current_directory.html","link":"./current_directory.html","example":"<a href=\"./current_directory.html\">should follow relative links to files in the current directory</a>","message":"should follow relative links to files in the current directory","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/same_directory.html","link":"../relative/same_directory.html","example":"<a href=\"../relative/same_directory.html\">should follow links that transverse directories</a>","message":"should follow links that transverse directories","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/%23","link":"#","example":"<a href=\"#\">should ignore in-page links</a>","message":"should ignore in-page links","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/remote/next.html","link":"http://spidr.rubyforge.org/course/remote/next.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/next.html\">should follow remote links to unvisited pages</a>","message":"should follow remote links to unvisited pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/start.html\">should not follow remote links to the same page</a>","message":"should not follow remote links to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/loop/../remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/loop/../remote/start.html\">should not follow remote links with a relative path to the same page</a>","message":"should not follow remote links with a relative path to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org:1337/path/","link":"http://spidr.rubyforge.org:1337/path/","example":"<a href=\"http://spidr.rubyforge.org:1337/path/\">should ignore links that fail</a>","message":"should ignore links that fail","behavior":"fail"},{"url":"http://spidr.rubyforge.org/course/frames/iframe_next.html","link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes</a>","message":"should follow links within iframes","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/frames/frame_next.html","link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames</a>","message":"should follow links within frames","behavior":"follow"}]
data/tasks/course.rb CHANGED
@@ -1,3 +1,10 @@
1
+ lib_dir = File.expand_path(File.join(File.dirname(__FILE__),'..','lib'))
2
+ unless $LOAD_PATH.include?(lib_dir)
3
+ $LOAD_PATH.unshift(lib_dir)
4
+ end
5
+
6
+ require 'spidr/extensions/uri'
7
+
1
8
  require 'nokogiri'
2
9
  require 'json'
3
10
 
@@ -22,7 +29,7 @@ namespace :course do
22
29
  absolute_url = page_url.merge(URI.encode(relative_url))
23
30
 
24
31
  if absolute_url.path
25
- absolute_url.path = File.expand_path(absolute_url.path)
32
+ absolute_url.path = URI.expand_path(absolute_url.path)
26
33
  end
27
34
 
28
35
  spec_data.merge(
data/tasks/spec.rb CHANGED
@@ -6,4 +6,5 @@ Spec::Rake::SpecTask.new(:spec) do |t|
6
6
  t.spec_opts = ['--colour', '--format', 'specdoc']
7
7
  end
8
8
 
9
+ task :test => :spec
9
10
  task :default => :spec
data/tasks/yard.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'yard'
2
+
3
+ YARD::Rake::YardocTask.new do |t|
4
+ t.files = ['lib/**/*.rb']
5
+ t.options = [
6
+ '--protected',
7
+ '--files', 'History.txt',
8
+ '--title', 'Spidr'
9
+ ]
10
+ end
11
+
12
+ task :docs => :yardoc
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -30,7 +30,7 @@ cert_chain:
30
30
  pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2009-06-13 00:00:00 -07:00
33
+ date: 2009-10-10 00:00:00 -07:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -41,7 +41,27 @@ dependencies:
41
41
  requirements:
42
42
  - - ">="
43
43
  - !ruby/object:Gem::Version
44
- version: "0"
44
+ version: 1.2.0
45
+ version:
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ type: :development
49
+ version_requirement:
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.8
55
+ version:
56
+ - !ruby/object:Gem::Dependency
57
+ name: yard
58
+ type: :development
59
+ version_requirement:
60
+ version_requirements: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 0.2.3.5
45
65
  version:
46
66
  - !ruby/object:Gem::Dependency
47
67
  name: hoe
@@ -51,7 +71,7 @@ dependencies:
51
71
  requirements:
52
72
  - - ">="
53
73
  - !ruby/object:Gem::Version
54
- version: 2.0.0
74
+ version: 2.3.3
55
75
  version:
56
76
  description: |-
57
77
  Spidr is a versatile Ruby web spidering library that can spider a site,
@@ -73,15 +93,34 @@ files:
73
93
  - README.txt
74
94
  - Rakefile
75
95
  - lib/spidr.rb
96
+ - lib/spidr/extensions.rb
97
+ - lib/spidr/extensions/uri.rb
76
98
  - lib/spidr/page.rb
77
99
  - lib/spidr/rules.rb
100
+ - lib/spidr/filters.rb
101
+ - lib/spidr/events.rb
102
+ - lib/spidr/actions.rb
103
+ - lib/spidr/actions/exceptions.rb
104
+ - lib/spidr/actions/exceptions/action.rb
105
+ - lib/spidr/actions/exceptions/paused.rb
106
+ - lib/spidr/actions/exceptions/skip_link.rb
107
+ - lib/spidr/actions/exceptions/skip_page.rb
108
+ - lib/spidr/actions/actions.rb
78
109
  - lib/spidr/agent.rb
79
110
  - lib/spidr/spidr.rb
80
111
  - lib/spidr/version.rb
81
112
  - tasks/spec.rb
113
+ - tasks/yard.rb
82
114
  - tasks/course.rb
83
115
  - spec/spec_helper.rb
84
116
  - spec/helpers/course.rb
117
+ - spec/helpers/page.rb
118
+ - spec/extensions/uri_spec.rb
119
+ - spec/page_examples.rb
120
+ - spec/page_spec.rb
121
+ - spec/rules_spec.rb
122
+ - spec/filters_spec.rb
123
+ - spec/actions_spec.rb
85
124
  - spec/agent_spec.rb
86
125
  - spec/spidr_spec.rb
87
126
  - static/course/index.html
@@ -114,7 +153,7 @@ files:
114
153
  - static/course/frames/frame.html
115
154
  - static/course/frames/frame_next.html
116
155
  - static/course/specs.json
117
- has_rdoc: true
156
+ has_rdoc: yard
118
157
  homepage: http://spidr.rubyforge.org/
119
158
  licenses: []
120
159
 
@@ -139,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
139
178
  requirements: []
140
179
 
141
180
  rubyforge_project: spidr
142
- rubygems_version: 1.3.4
181
+ rubygems_version: 1.3.5
143
182
  signing_key:
144
183
  specification_version: 3
145
184
  summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
metadata.gz.sig CHANGED
Binary file