spidr 0.1.9 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +43 -0
- data/Manifest.txt +19 -0
- data/README.txt +100 -11
- data/Rakefile +15 -5
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/actions/actions.rb +79 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +6 -0
- data/lib/spidr/actions/exceptions/paused.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
- data/lib/spidr/agent.rb +385 -444
- data/lib/spidr/events.rb +87 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/extensions/uri.rb +45 -0
- data/lib/spidr/filters.rb +438 -0
- data/lib/spidr/page.rb +211 -70
- data/lib/spidr/rules.rb +40 -18
- data/lib/spidr/spidr.rb +57 -7
- data/lib/spidr/version.rb +2 -1
- data/spec/actions_spec.rb +61 -0
- data/spec/agent_spec.rb +24 -31
- data/spec/extensions/uri_spec.rb +39 -0
- data/spec/filters_spec.rb +53 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/page_examples.rb +17 -0
- data/spec/page_spec.rb +81 -0
- data/spec/rules_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/spidr_spec.rb +30 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +8 -1
- data/tasks/spec.rb +1 -0
- data/tasks/yard.rb +12 -0
- metadata +45 -6
- metadata.gz.sig +0 -0
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'spidr/page'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
shared_examples_for "Page" do
|
6
|
+
it "should have a status code" do
|
7
|
+
@page.code.should be_integer
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should have a body" do
|
11
|
+
@page.body.should_not be_empty
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should provide transparent access to the response headers" do
|
15
|
+
@page.content_type.should == @page.content_type
|
16
|
+
end
|
17
|
+
end
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'spidr/page'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'page_examples'
|
5
|
+
require 'helpers/page'
|
6
|
+
|
7
|
+
describe Page do
|
8
|
+
describe "html" do
|
9
|
+
before(:all) do
|
10
|
+
@page = get_page('http://spidr.rubyforge.org/course/start.html')
|
11
|
+
end
|
12
|
+
|
13
|
+
it_should_behave_like "Page"
|
14
|
+
|
15
|
+
it "should be OK" do
|
16
|
+
@page.should be_ok
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should have a content-type" do
|
20
|
+
@page.content_type.should =~ /text\/html/
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should be a html page" do
|
24
|
+
@page.should be_html
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should have provide a document" do
|
28
|
+
@page.doc.class.should == Nokogiri::HTML::Document
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should allow searching the document" do
|
32
|
+
@page.doc.search('//p').length.should == 2
|
33
|
+
@page.doc.at('//p[2]').inner_text.should == 'Ready! Set! Go!'
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should have a title" do
|
37
|
+
@page.title.should == 'Spidr :: Web-Spider Obstacle Course :: Start'
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should have links" do
|
41
|
+
@page.links.should_not be_empty
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
describe "txt" do
|
46
|
+
before(:all) do
|
47
|
+
@page = get_page('http://www.ruby-lang.org/en/LICENSE.txt')
|
48
|
+
end
|
49
|
+
|
50
|
+
it_should_behave_like "Page"
|
51
|
+
|
52
|
+
it "should be OK" do
|
53
|
+
@page.should be_ok
|
54
|
+
end
|
55
|
+
|
56
|
+
it "should have a content-type" do
|
57
|
+
@page.content_type.should =~ /text\/plain/
|
58
|
+
end
|
59
|
+
|
60
|
+
it "should be a txt page" do
|
61
|
+
@page.should be_txt
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should not have provide a document" do
|
65
|
+
@page.doc.should be_nil
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should not allow searching the document" do
|
69
|
+
@page.search('//p').should be_empty
|
70
|
+
@page.at('//p').should be_nil
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should not have links" do
|
74
|
+
@page.links.should be_empty
|
75
|
+
end
|
76
|
+
|
77
|
+
it "should not have a title" do
|
78
|
+
@page.title.should be_nil
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
data/spec/rules_spec.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'spidr/rules'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe Rules do
|
6
|
+
it "should accept data based on acceptance data" do
|
7
|
+
rules = Rules.new(:accept => [1])
|
8
|
+
|
9
|
+
rules.accept?(1).should == true
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should accept data based on acceptance regexps" do
|
13
|
+
rules = Rules.new(:accept => [/1/])
|
14
|
+
|
15
|
+
rules.accept?('1').should == true
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should match non-Strings using acceptance regexps" do
|
19
|
+
rules = Rules.new(:accept => [/1/])
|
20
|
+
|
21
|
+
rules.accept?(1).should == true
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should accept data using acceptance lambdas" do
|
25
|
+
rules = Rules.new(:accept => [lambda { |data| data > 2 }])
|
26
|
+
|
27
|
+
rules.accept?(3).should == true
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should reject data that does not match any acceptance patterns" do
|
31
|
+
rules = Rules.new(:accept => [1, 2, 3])
|
32
|
+
|
33
|
+
rules.accept?(2).should == true
|
34
|
+
rules.accept?(4).should == false
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should accept data that does not match any rejection patterns" do
|
38
|
+
rules = Rules.new(:reject => [1, 2, 3])
|
39
|
+
|
40
|
+
rules.accept?(2).should == false
|
41
|
+
rules.accept?(4).should == true
|
42
|
+
end
|
43
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/spidr_spec.rb
CHANGED
@@ -6,4 +6,34 @@ describe Spidr do
|
|
6
6
|
it "should have a VERSION constant" do
|
7
7
|
Spidr.const_defined?('VERSION').should == true
|
8
8
|
end
|
9
|
+
|
10
|
+
describe "proxy" do
|
11
|
+
after(:all) do
|
12
|
+
Spidr.disable_proxy!
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should not have proxy settings by default" do
|
16
|
+
Spidr.proxy[:host].should be_nil
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should allow setting new proxy settings" do
|
20
|
+
Spidr.proxy = {:host => 'example.com', :port => 8010}
|
21
|
+
|
22
|
+
Spidr.proxy[:host].should == 'example.com'
|
23
|
+
Spidr.proxy[:port].should == 8010
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should default the :port option of new proxy settings" do
|
27
|
+
Spidr.proxy = {:host => 'example.com'}
|
28
|
+
|
29
|
+
Spidr.proxy[:host].should == 'example.com'
|
30
|
+
Spidr.proxy[:port].should == Spidr::COMMON_PROXY_PORT
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should allow disabling the proxy" do
|
34
|
+
Spidr.disable_proxy!
|
35
|
+
|
36
|
+
Spidr.proxy[:host].should be_nil
|
37
|
+
end
|
38
|
+
end
|
9
39
|
end
|
data/static/course/specs.json
CHANGED
@@ -1 +1 @@
|
|
1
|
-
[{"link":"
|
1
|
+
[{"url":"http://spidr.rubyforge.org/course/absolute/next.html","link":"/course/absolute/next.html","example":"<a href=\"/course/absolute/next.html\">should follow absolute links to unvisited pages</a>","message":"should follow absolute links to unvisited pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/absolute/start.html","link":"/course/absolute/start.html","example":"<a href=\"/course/absolute/start.html\">should not follow absolute links to the current page</a>","message":"should not follow absolute links to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a>should not follow links with no href attributes</a>","message":"should not follow links with no href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a href=\"\">should not follow links with empty href attributes</a>","message":"should not follow links with empty href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/%20","link":" ","example":"<a href=\"\">should ignore links with blank href attributes</a>","message":"should ignore links with blank href attributes","behavior":"ignore"},{"url":"javascript:fail();","link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"</a>","message":"should ignore links beginning with \"javascript:\"","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/javascript/%23","link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.</a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages</a>","message":"should not 
follow links to previously visited pages","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/loop/next.html","link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages</a>","message":"should follow links pointing to other pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page</a>","message":"should not follow links pointing to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/relative/normal.html","link":"normal.html","example":"<a href=\"normal.html\">should follow relative links</a>","message":"should follow relative links","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/current_directory.html","link":"./current_directory.html","example":"<a href=\"./current_directory.html\">should follow relative links to files in the current directory</a>","message":"should follow relative links to files in the current directory","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/same_directory.html","link":"../relative/same_directory.html","example":"<a href=\"../relative/same_directory.html\">should follow links that transverse directories</a>","message":"should follow links that transverse directories","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/%23","link":"#","example":"<a href=\"#\">should ignore in-page links</a>","message":"should ignore in-page links","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/remote/next.html","link":"http://spidr.rubyforge.org/course/remote/next.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/next.html\">should follow remote links to unvisited pages</a>","message":"should follow remote links to unvisited 
pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/start.html\">should not follow remote links to the same page</a>","message":"should not follow remote links to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/loop/../remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/loop/../remote/start.html\">should not follow remote links with a relative path to the same page</a>","message":"should not follow remote links with a relative path to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org:1337/path/","link":"http://spidr.rubyforge.org:1337/path/","example":"<a href=\"http://spidr.rubyforge.org:1337/path/\">should ignore links that fail</a>","message":"should ignore links that fail","behavior":"fail"},{"url":"http://spidr.rubyforge.org/course/frames/iframe_next.html","link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes</a>","message":"should follow links within iframes","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/frames/frame_next.html","link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames</a>","message":"should follow links within frames","behavior":"follow"}]
|
data/tasks/course.rb
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
lib_dir = File.expand_path(File.join(File.dirname(__FILE__),'..','lib'))
|
2
|
+
unless $LOAD_PATH.include?(lib_dir)
|
3
|
+
$LOAD_PATH.unshift(lib_dir)
|
4
|
+
end
|
5
|
+
|
6
|
+
require 'spidr/extensions/uri'
|
7
|
+
|
1
8
|
require 'nokogiri'
|
2
9
|
require 'json'
|
3
10
|
|
@@ -22,7 +29,7 @@ namespace :course do
|
|
22
29
|
absolute_url = page_url.merge(URI.encode(relative_url))
|
23
30
|
|
24
31
|
if absolute_url.path
|
25
|
-
absolute_url.path =
|
32
|
+
absolute_url.path = URI.expand_path(absolute_url.path)
|
26
33
|
end
|
27
34
|
|
28
35
|
spec_data.merge(
|
data/tasks/spec.rb
CHANGED
data/tasks/yard.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2009-
|
33
|
+
date: 2009-10-10 00:00:00 -07:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -41,7 +41,27 @@ dependencies:
|
|
41
41
|
requirements:
|
42
42
|
- - ">="
|
43
43
|
- !ruby/object:Gem::Version
|
44
|
-
version:
|
44
|
+
version: 1.2.0
|
45
|
+
version:
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
type: :development
|
49
|
+
version_requirement:
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.2.8
|
55
|
+
version:
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: yard
|
58
|
+
type: :development
|
59
|
+
version_requirement:
|
60
|
+
version_requirements: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: 0.2.3.5
|
45
65
|
version:
|
46
66
|
- !ruby/object:Gem::Dependency
|
47
67
|
name: hoe
|
@@ -51,7 +71,7 @@ dependencies:
|
|
51
71
|
requirements:
|
52
72
|
- - ">="
|
53
73
|
- !ruby/object:Gem::Version
|
54
|
-
version: 2.
|
74
|
+
version: 2.3.3
|
55
75
|
version:
|
56
76
|
description: |-
|
57
77
|
Spidr is a versatile Ruby web spidering library that can spider a site,
|
@@ -73,15 +93,34 @@ files:
|
|
73
93
|
- README.txt
|
74
94
|
- Rakefile
|
75
95
|
- lib/spidr.rb
|
96
|
+
- lib/spidr/extensions.rb
|
97
|
+
- lib/spidr/extensions/uri.rb
|
76
98
|
- lib/spidr/page.rb
|
77
99
|
- lib/spidr/rules.rb
|
100
|
+
- lib/spidr/filters.rb
|
101
|
+
- lib/spidr/events.rb
|
102
|
+
- lib/spidr/actions.rb
|
103
|
+
- lib/spidr/actions/exceptions.rb
|
104
|
+
- lib/spidr/actions/exceptions/action.rb
|
105
|
+
- lib/spidr/actions/exceptions/paused.rb
|
106
|
+
- lib/spidr/actions/exceptions/skip_link.rb
|
107
|
+
- lib/spidr/actions/exceptions/skip_page.rb
|
108
|
+
- lib/spidr/actions/actions.rb
|
78
109
|
- lib/spidr/agent.rb
|
79
110
|
- lib/spidr/spidr.rb
|
80
111
|
- lib/spidr/version.rb
|
81
112
|
- tasks/spec.rb
|
113
|
+
- tasks/yard.rb
|
82
114
|
- tasks/course.rb
|
83
115
|
- spec/spec_helper.rb
|
84
116
|
- spec/helpers/course.rb
|
117
|
+
- spec/helpers/page.rb
|
118
|
+
- spec/extensions/uri_spec.rb
|
119
|
+
- spec/page_examples.rb
|
120
|
+
- spec/page_spec.rb
|
121
|
+
- spec/rules_spec.rb
|
122
|
+
- spec/filters_spec.rb
|
123
|
+
- spec/actions_spec.rb
|
85
124
|
- spec/agent_spec.rb
|
86
125
|
- spec/spidr_spec.rb
|
87
126
|
- static/course/index.html
|
@@ -114,7 +153,7 @@ files:
|
|
114
153
|
- static/course/frames/frame.html
|
115
154
|
- static/course/frames/frame_next.html
|
116
155
|
- static/course/specs.json
|
117
|
-
has_rdoc:
|
156
|
+
has_rdoc: yard
|
118
157
|
homepage: http://spidr.rubyforge.org/
|
119
158
|
licenses: []
|
120
159
|
|
@@ -139,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
139
178
|
requirements: []
|
140
179
|
|
141
180
|
rubyforge_project: spidr
|
142
|
-
rubygems_version: 1.3.
|
181
|
+
rubygems_version: 1.3.5
|
143
182
|
signing_key:
|
144
183
|
specification_version: 3
|
145
184
|
summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
|
metadata.gz.sig
CHANGED
Binary file
|