spidr 0.1.9 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +43 -0
- data/Manifest.txt +19 -0
- data/README.txt +100 -11
- data/Rakefile +15 -5
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/actions/actions.rb +79 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +6 -0
- data/lib/spidr/actions/exceptions/paused.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
- data/lib/spidr/agent.rb +385 -444
- data/lib/spidr/events.rb +87 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/extensions/uri.rb +45 -0
- data/lib/spidr/filters.rb +438 -0
- data/lib/spidr/page.rb +211 -70
- data/lib/spidr/rules.rb +40 -18
- data/lib/spidr/spidr.rb +57 -7
- data/lib/spidr/version.rb +2 -1
- data/spec/actions_spec.rb +61 -0
- data/spec/agent_spec.rb +24 -31
- data/spec/extensions/uri_spec.rb +39 -0
- data/spec/filters_spec.rb +53 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/page_examples.rb +17 -0
- data/spec/page_spec.rb +81 -0
- data/spec/rules_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/spidr_spec.rb +30 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +8 -1
- data/tasks/spec.rb +1 -0
- data/tasks/yard.rb +12 -0
- metadata +45 -6
- metadata.gz.sig +0 -0
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'spidr/page'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
shared_examples_for "Page" do
|
6
|
+
it "should have a status code" do
|
7
|
+
@page.code.should be_integer
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should have a body" do
|
11
|
+
@page.body.should_not be_empty
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should provide transparent access to the response headers" do
|
15
|
+
@page.content_type.should == @page.content_type
|
16
|
+
end
|
17
|
+
end
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'spidr/page'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'page_examples'
|
5
|
+
require 'helpers/page'
|
6
|
+
|
7
|
+
describe Page do
|
8
|
+
describe "html" do
|
9
|
+
before(:all) do
|
10
|
+
@page = get_page('http://spidr.rubyforge.org/course/start.html')
|
11
|
+
end
|
12
|
+
|
13
|
+
it_should_behave_like "Page"
|
14
|
+
|
15
|
+
it "should be OK" do
|
16
|
+
@page.should be_ok
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should have a content-type" do
|
20
|
+
@page.content_type.should =~ /text\/html/
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should be a html page" do
|
24
|
+
@page.should be_html
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should have provide a document" do
|
28
|
+
@page.doc.class.should == Nokogiri::HTML::Document
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should allow searching the document" do
|
32
|
+
@page.doc.search('//p').length.should == 2
|
33
|
+
@page.doc.at('//p[2]').inner_text.should == 'Ready! Set! Go!'
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should have a title" do
|
37
|
+
@page.title.should == 'Spidr :: Web-Spider Obstacle Course :: Start'
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should have links" do
|
41
|
+
@page.links.should_not be_empty
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
describe "txt" do
|
46
|
+
before(:all) do
|
47
|
+
@page = get_page('http://www.ruby-lang.org/en/LICENSE.txt')
|
48
|
+
end
|
49
|
+
|
50
|
+
it_should_behave_like "Page"
|
51
|
+
|
52
|
+
it "should be OK" do
|
53
|
+
@page.should be_ok
|
54
|
+
end
|
55
|
+
|
56
|
+
it "should have a content-type" do
|
57
|
+
@page.content_type.should =~ /text\/plain/
|
58
|
+
end
|
59
|
+
|
60
|
+
it "should be a txt page" do
|
61
|
+
@page.should be_txt
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should not have provide a document" do
|
65
|
+
@page.doc.should be_nil
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should not allow searching the document" do
|
69
|
+
@page.search('//p').should be_empty
|
70
|
+
@page.at('//p').should be_nil
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should not have links" do
|
74
|
+
@page.links.should be_empty
|
75
|
+
end
|
76
|
+
|
77
|
+
it "should not have a title" do
|
78
|
+
@page.title.should be_nil
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
data/spec/rules_spec.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'spidr/rules'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe Rules do
|
6
|
+
it "should accept data based on acceptance data" do
|
7
|
+
rules = Rules.new(:accept => [1])
|
8
|
+
|
9
|
+
rules.accept?(1).should == true
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should accept data based on acceptance regexps" do
|
13
|
+
rules = Rules.new(:accept => [/1/])
|
14
|
+
|
15
|
+
rules.accept?('1').should == true
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should match non-Strings using acceptance regexps" do
|
19
|
+
rules = Rules.new(:accept => [/1/])
|
20
|
+
|
21
|
+
rules.accept?(1).should == true
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should accept data using acceptance lambdas" do
|
25
|
+
rules = Rules.new(:accept => [lambda { |data| data > 2 }])
|
26
|
+
|
27
|
+
rules.accept?(3).should == true
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should reject data that does not match any acceptance patterns" do
|
31
|
+
rules = Rules.new(:accept => [1, 2, 3])
|
32
|
+
|
33
|
+
rules.accept?(2).should == true
|
34
|
+
rules.accept?(4).should == false
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should accept data that does not match any rejection patterns" do
|
38
|
+
rules = Rules.new(:reject => [1, 2, 3])
|
39
|
+
|
40
|
+
rules.accept?(2).should == false
|
41
|
+
rules.accept?(4).should == true
|
42
|
+
end
|
43
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/spidr_spec.rb
CHANGED
@@ -6,4 +6,34 @@ describe Spidr do
|
|
6
6
|
it "should have a VERSION constant" do
|
7
7
|
Spidr.const_defined?('VERSION').should == true
|
8
8
|
end
|
9
|
+
|
10
|
+
describe "proxy" do
|
11
|
+
after(:all) do
|
12
|
+
Spidr.disable_proxy!
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should not have proxy settings by default" do
|
16
|
+
Spidr.proxy[:host].should be_nil
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should allow setting new proxy settings" do
|
20
|
+
Spidr.proxy = {:host => 'example.com', :port => 8010}
|
21
|
+
|
22
|
+
Spidr.proxy[:host].should == 'example.com'
|
23
|
+
Spidr.proxy[:port].should == 8010
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should default the :port option of new proxy settings" do
|
27
|
+
Spidr.proxy = {:host => 'example.com'}
|
28
|
+
|
29
|
+
Spidr.proxy[:host].should == 'example.com'
|
30
|
+
Spidr.proxy[:port].should == Spidr::COMMON_PROXY_PORT
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should allow disabling the proxy" do
|
34
|
+
Spidr.disable_proxy!
|
35
|
+
|
36
|
+
Spidr.proxy[:host].should be_nil
|
37
|
+
end
|
38
|
+
end
|
9
39
|
end
|
data/static/course/specs.json
CHANGED
@@ -1 +1 @@
|
|
1
|
-
[{"link":"
|
1
|
+
[{"url":"http://spidr.rubyforge.org/course/absolute/next.html","link":"/course/absolute/next.html","example":"<a href=\"/course/absolute/next.html\">should follow absolute links to unvisited pages</a>","message":"should follow absolute links to unvisited pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/absolute/start.html","link":"/course/absolute/start.html","example":"<a href=\"/course/absolute/start.html\">should not follow absolute links to the current page</a>","message":"should not follow absolute links to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a>should not follow links with no href attributes</a>","message":"should not follow links with no href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a href=\"\">should not follow links with empty href attributes</a>","message":"should not follow links with empty href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/%20","link":" ","example":"<a href=\"\">should ignore links with blank href attributes</a>","message":"should ignore links with blank href attributes","behavior":"ignore"},{"url":"javascript:fail();","link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"</a>","message":"should ignore links beginning with \"javascript:\"","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/javascript/%23","link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.</a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages</a>","message":"should not follow links to previously visited pages","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/loop/next.html","link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages</a>","message":"should follow links pointing to other pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page</a>","message":"should not follow links pointing to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/relative/normal.html","link":"normal.html","example":"<a href=\"normal.html\">should follow relative links</a>","message":"should follow relative links","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/current_directory.html","link":"./current_directory.html","example":"<a href=\"./current_directory.html\">should follow relative links to files in the current directory</a>","message":"should follow relative links to files in the current directory","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/same_directory.html","link":"../relative/same_directory.html","example":"<a href=\"../relative/same_directory.html\">should follow links that transverse directories</a>","message":"should follow links that transverse directories","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/%23","link":"#","example":"<a href=\"#\">should ignore in-page links</a>","message":"should ignore in-page links","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/remote/next.html","link":"http://spidr.rubyforge.org/course/remote/next.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/next.html\">should follow remote links to unvisited pages</a>","message":"should follow remote links to unvisited pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/start.html\">should not follow remote links to the same page</a>","message":"should not follow remote links to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/loop/../remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/loop/../remote/start.html\">should not follow remote links with a relative path to the same page</a>","message":"should not follow remote links with a relative path to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org:1337/path/","link":"http://spidr.rubyforge.org:1337/path/","example":"<a href=\"http://spidr.rubyforge.org:1337/path/\">should ignore links that fail</a>","message":"should ignore links that fail","behavior":"fail"},{"url":"http://spidr.rubyforge.org/course/frames/iframe_next.html","link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes</a>","message":"should follow links within iframes","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/frames/frame_next.html","link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames</a>","message":"should follow links within frames","behavior":"follow"}]
|
data/tasks/course.rb
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
lib_dir = File.expand_path(File.join(File.dirname(__FILE__),'..','lib'))
|
2
|
+
unless $LOAD_PATH.include?(lib_dir)
|
3
|
+
$LOAD_PATH.unshift(lib_dir)
|
4
|
+
end
|
5
|
+
|
6
|
+
require 'spidr/extensions/uri'
|
7
|
+
|
1
8
|
require 'nokogiri'
|
2
9
|
require 'json'
|
3
10
|
|
@@ -22,7 +29,7 @@ namespace :course do
|
|
22
29
|
absolute_url = page_url.merge(URI.encode(relative_url))
|
23
30
|
|
24
31
|
if absolute_url.path
|
25
|
-
absolute_url.path =
|
32
|
+
absolute_url.path = URI.expand_path(absolute_url.path)
|
26
33
|
end
|
27
34
|
|
28
35
|
spec_data.merge(
|
data/tasks/spec.rb
CHANGED
data/tasks/yard.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2009-
|
33
|
+
date: 2009-10-10 00:00:00 -07:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -41,7 +41,27 @@ dependencies:
|
|
41
41
|
requirements:
|
42
42
|
- - ">="
|
43
43
|
- !ruby/object:Gem::Version
|
44
|
-
version:
|
44
|
+
version: 1.2.0
|
45
|
+
version:
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
type: :development
|
49
|
+
version_requirement:
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.2.8
|
55
|
+
version:
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: yard
|
58
|
+
type: :development
|
59
|
+
version_requirement:
|
60
|
+
version_requirements: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: 0.2.3.5
|
45
65
|
version:
|
46
66
|
- !ruby/object:Gem::Dependency
|
47
67
|
name: hoe
|
@@ -51,7 +71,7 @@ dependencies:
|
|
51
71
|
requirements:
|
52
72
|
- - ">="
|
53
73
|
- !ruby/object:Gem::Version
|
54
|
-
version: 2.
|
74
|
+
version: 2.3.3
|
55
75
|
version:
|
56
76
|
description: |-
|
57
77
|
Spidr is a versatile Ruby web spidering library that can spider a site,
|
@@ -73,15 +93,34 @@ files:
|
|
73
93
|
- README.txt
|
74
94
|
- Rakefile
|
75
95
|
- lib/spidr.rb
|
96
|
+
- lib/spidr/extensions.rb
|
97
|
+
- lib/spidr/extensions/uri.rb
|
76
98
|
- lib/spidr/page.rb
|
77
99
|
- lib/spidr/rules.rb
|
100
|
+
- lib/spidr/filters.rb
|
101
|
+
- lib/spidr/events.rb
|
102
|
+
- lib/spidr/actions.rb
|
103
|
+
- lib/spidr/actions/exceptions.rb
|
104
|
+
- lib/spidr/actions/exceptions/action.rb
|
105
|
+
- lib/spidr/actions/exceptions/paused.rb
|
106
|
+
- lib/spidr/actions/exceptions/skip_link.rb
|
107
|
+
- lib/spidr/actions/exceptions/skip_page.rb
|
108
|
+
- lib/spidr/actions/actions.rb
|
78
109
|
- lib/spidr/agent.rb
|
79
110
|
- lib/spidr/spidr.rb
|
80
111
|
- lib/spidr/version.rb
|
81
112
|
- tasks/spec.rb
|
113
|
+
- tasks/yard.rb
|
82
114
|
- tasks/course.rb
|
83
115
|
- spec/spec_helper.rb
|
84
116
|
- spec/helpers/course.rb
|
117
|
+
- spec/helpers/page.rb
|
118
|
+
- spec/extensions/uri_spec.rb
|
119
|
+
- spec/page_examples.rb
|
120
|
+
- spec/page_spec.rb
|
121
|
+
- spec/rules_spec.rb
|
122
|
+
- spec/filters_spec.rb
|
123
|
+
- spec/actions_spec.rb
|
85
124
|
- spec/agent_spec.rb
|
86
125
|
- spec/spidr_spec.rb
|
87
126
|
- static/course/index.html
|
@@ -114,7 +153,7 @@ files:
|
|
114
153
|
- static/course/frames/frame.html
|
115
154
|
- static/course/frames/frame_next.html
|
116
155
|
- static/course/specs.json
|
117
|
-
has_rdoc:
|
156
|
+
has_rdoc: yard
|
118
157
|
homepage: http://spidr.rubyforge.org/
|
119
158
|
licenses: []
|
120
159
|
|
@@ -139,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
139
178
|
requirements: []
|
140
179
|
|
141
180
|
rubyforge_project: spidr
|
142
|
-
rubygems_version: 1.3.
|
181
|
+
rubygems_version: 1.3.5
|
143
182
|
signing_key:
|
144
183
|
specification_version: 3
|
145
184
|
summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
|
metadata.gz.sig
CHANGED
Binary file
|