spidr 0.1.9 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +43 -0
- data/Manifest.txt +19 -0
- data/README.txt +100 -11
- data/Rakefile +15 -5
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/actions/actions.rb +79 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +6 -0
- data/lib/spidr/actions/exceptions/paused.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
- data/lib/spidr/agent.rb +385 -444
- data/lib/spidr/events.rb +87 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/extensions/uri.rb +45 -0
- data/lib/spidr/filters.rb +438 -0
- data/lib/spidr/page.rb +211 -70
- data/lib/spidr/rules.rb +40 -18
- data/lib/spidr/spidr.rb +57 -7
- data/lib/spidr/version.rb +2 -1
- data/spec/actions_spec.rb +61 -0
- data/spec/agent_spec.rb +24 -31
- data/spec/extensions/uri_spec.rb +39 -0
- data/spec/filters_spec.rb +53 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/page_examples.rb +17 -0
- data/spec/page_spec.rb +81 -0
- data/spec/rules_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/spidr_spec.rb +30 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +8 -1
- data/tasks/spec.rb +1 -0
- data/tasks/yard.rb +12 -0
- metadata +45 -6
- metadata.gz.sig +0 -0
data/lib/spidr/rules.rb
CHANGED
@@ -7,25 +7,43 @@ module Spidr
|
|
7
7
|
# Reject rules
|
8
8
|
attr_reader :reject
|
9
9
|
|
10
|
+
#
|
11
|
+
# Creates a new Rules object.
|
12
|
+
#
|
13
|
+
# @param [Hash] options
|
14
|
+
# Additional options.
|
15
|
+
#
|
16
|
+
# @option options [Array<String, Regexp, Proc>] :accept
|
17
|
+
# The patterns to accept data with.
|
18
|
+
#
|
19
|
+
# @option options [Array<String, Regexp, Proc>] :reject
|
20
|
+
# The patterns to reject data with.
|
21
|
+
#
|
10
22
|
def initialize(options={})
|
11
|
-
@accept =
|
12
|
-
@reject =
|
23
|
+
@accept = []
|
24
|
+
@reject = []
|
25
|
+
|
26
|
+
@accept += options[:accept] if options[:accept]
|
27
|
+
@reject += options[:reject] if options[:reject]
|
13
28
|
end
|
14
29
|
|
15
30
|
#
|
16
|
-
#
|
17
|
-
#
|
31
|
+
# Determines whether the data should be accepted or rejected.
|
32
|
+
#
|
33
|
+
# @return [Boolean]
|
34
|
+
# Specifies whether the given data was accepted, using the rules
|
35
|
+
# acceptance patterns.
|
18
36
|
#
|
19
|
-
def accept?(
|
37
|
+
def accept?(data)
|
20
38
|
unless @accept.empty?
|
21
39
|
@accept.each do |rule|
|
22
|
-
return true if
|
40
|
+
return true if test_data(data,rule)
|
23
41
|
end
|
24
42
|
|
25
43
|
return false
|
26
44
|
else
|
27
45
|
@reject.each do |rule|
|
28
|
-
return false if
|
46
|
+
return false if test_data(data,rule)
|
29
47
|
end
|
30
48
|
|
31
49
|
return true
|
@@ -33,27 +51,31 @@ module Spidr
|
|
33
51
|
end
|
34
52
|
|
35
53
|
#
|
36
|
-
#
|
37
|
-
# returns +false+ otherwise.
|
54
|
+
# Determines whether the data should be rejected or accepted.
|
38
55
|
#
|
39
|
-
|
40
|
-
|
56
|
+
# @return [Boolean]
|
57
|
+
# Specifies whether the given data was rejected, using the rules
|
58
|
+
# rejection patterns.
|
59
|
+
#
|
60
|
+
def reject?(data)
|
61
|
+
!(accept?(data))
|
41
62
|
end
|
42
63
|
|
43
64
|
protected
|
44
65
|
|
45
66
|
#
|
46
|
-
# Tests the
|
47
|
-
#
|
48
|
-
#
|
67
|
+
# Tests the given data against a given pattern.
|
68
|
+
#
|
69
|
+
# @return [Boolean]
|
70
|
+
# Specifies whether the given data matched the pattern.
|
49
71
|
#
|
50
|
-
def
|
72
|
+
def test_data(data,rule)
|
51
73
|
if rule.kind_of?(Proc)
|
52
|
-
return (rule.call(
|
74
|
+
return (rule.call(data) == true)
|
53
75
|
elsif rule.kind_of?(Regexp)
|
54
|
-
return !((
|
76
|
+
return !((data.to_s =~ rule).nil?)
|
55
77
|
else
|
56
|
-
return
|
78
|
+
return data == rule
|
57
79
|
end
|
58
80
|
end
|
59
81
|
|
data/lib/spidr/spidr.rb
CHANGED
@@ -4,43 +4,93 @@ module Spidr
|
|
4
4
|
# Common proxy port.
|
5
5
|
COMMON_PROXY_PORT = 8080
|
6
6
|
|
7
|
+
# Default proxy information.
|
8
|
+
DEFAULT_PROXY = {
|
9
|
+
:host => nil,
|
10
|
+
:port => COMMON_PROXY_PORT,
|
11
|
+
:user => nil,
|
12
|
+
:password => nil
|
13
|
+
}
|
14
|
+
|
15
|
+
#
|
16
|
+
# Proxy information used by all newly created Agent objects by default.
|
7
17
|
#
|
8
|
-
#
|
18
|
+
# @return [Hash]
|
19
|
+
# The Spidr proxy information.
|
9
20
|
#
|
10
21
|
def Spidr.proxy
|
11
|
-
@@spidr_proxy ||=
|
22
|
+
@@spidr_proxy ||= DEFAULT_PROXY
|
23
|
+
end
|
24
|
+
|
25
|
+
#
|
26
|
+
# Sets the proxy information used by Agent objects.
|
27
|
+
#
|
28
|
+
# @param [Hash] new_proxy
|
29
|
+
# The new proxy information.
|
30
|
+
#
|
31
|
+
# @option new_proxy [String] :host
|
32
|
+
# The host-name of the proxy.
|
33
|
+
#
|
34
|
+
# @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
|
35
|
+
# The port of the proxy.
|
36
|
+
#
|
37
|
+
# @option new_proxy [String] :user
|
38
|
+
# The user to authenticate with the proxy as.
|
39
|
+
#
|
40
|
+
# @option new_proxy [String] :password
|
41
|
+
# The password to authenticate with the proxy.
|
42
|
+
#
|
43
|
+
# @return [Hash]
|
44
|
+
# The new proxy information.
|
45
|
+
#
|
46
|
+
def Spidr.proxy=(new_proxy)
|
47
|
+
@@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
|
12
48
|
end
|
13
49
|
|
14
50
|
#
|
15
|
-
#
|
51
|
+
# Disables the proxy settings used by all newly created Agent objects.
|
52
|
+
#
|
53
|
+
def Spidr.disable_proxy!
|
54
|
+
@@spidr_proxy = DEFAULT_PROXY
|
55
|
+
return true
|
56
|
+
end
|
57
|
+
|
58
|
+
#
|
59
|
+
# The User-Agent string used by all Agent objects by default.
|
60
|
+
#
|
61
|
+
# @return [String]
|
62
|
+
# The Spidr User-Agent string.
|
16
63
|
#
|
17
64
|
def Spidr.user_agent
|
18
65
|
@@spidr_user_agent ||= nil
|
19
66
|
end
|
20
67
|
|
21
68
|
#
|
22
|
-
# Sets the Spidr
|
69
|
+
# Sets the Spidr User-Agent string.
|
70
|
+
#
|
71
|
+
# @param [String] new_agent
|
72
|
+
# The new User-Agent string.
|
23
73
|
#
|
24
74
|
def Spidr.user_agent=(new_agent)
|
25
75
|
@@spidr_user_agent = new_agent
|
26
76
|
end
|
27
77
|
|
28
78
|
#
|
29
|
-
#
|
79
|
+
# @see Agent.start_at
|
30
80
|
#
|
31
81
|
def Spidr.start_at(url,options={},&block)
|
32
82
|
Agent.start_at(url,options,&block)
|
33
83
|
end
|
34
84
|
|
35
85
|
#
|
36
|
-
#
|
86
|
+
# @see Agent.host
|
37
87
|
#
|
38
88
|
def Spidr.host(name,options={},&block)
|
39
89
|
Agent.host(name,options,&block)
|
40
90
|
end
|
41
91
|
|
42
92
|
#
|
43
|
-
#
|
93
|
+
# @see Agent.site
|
44
94
|
#
|
45
95
|
def Spidr.site(url,options={},&block)
|
46
96
|
Agent.site(url,options,&block)
|
data/lib/spidr/version.rb
CHANGED
data/spec/actions_spec.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'spidr/actions'
|
2
|
+
require 'spidr/agent'
|
3
|
+
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
describe Actions do
|
7
|
+
before(:all) do
|
8
|
+
@url = URI('http://spidr.rubyforge.org/')
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should be able to pause spidering" do
|
12
|
+
count = 0
|
13
|
+
agent = Agent.host('spidr.rubyforge.org') do |spider|
|
14
|
+
spider.every_page do |page|
|
15
|
+
count += 1
|
16
|
+
spider.pause! if count >= 2
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
agent.should be_paused
|
21
|
+
agent.history.length.should == 2
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should be able to continue spidering after being paused" do
|
25
|
+
agent = Agent.new do |spider|
|
26
|
+
spider.every_page do |page|
|
27
|
+
spider.pause!
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
agent.enqueue(@url)
|
32
|
+
agent.continue!
|
33
|
+
|
34
|
+
agent.visited?(@url).should == true
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should allow skipping of enqueued links" do
|
38
|
+
agent = Agent.new do |spider|
|
39
|
+
spider.every_url do |url|
|
40
|
+
spider.skip_link!
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
agent.enqueue(@url)
|
45
|
+
|
46
|
+
agent.queue.should be_empty
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should allow skipping of visited pages" do
|
50
|
+
agent = Agent.new do |spider|
|
51
|
+
spider.every_page do |url|
|
52
|
+
spider.skip_page!
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
agent.visit_page(@url)
|
57
|
+
|
58
|
+
agent.history.should == Set[@url]
|
59
|
+
agent.queue.should be_empty
|
60
|
+
end
|
61
|
+
end
|
data/spec/agent_spec.rb
CHANGED
@@ -20,19 +20,38 @@ describe Agent do
|
|
20
20
|
|
21
21
|
it "should be able to restore the history" do
|
22
22
|
agent = Agent.new
|
23
|
-
previous_history = [URI('http://www.example.com')]
|
23
|
+
previous_history = Set[URI('http://www.example.com')]
|
24
24
|
|
25
25
|
agent.history = previous_history
|
26
26
|
agent.history.should == previous_history
|
27
27
|
end
|
28
28
|
|
29
|
-
it "should convert new histories to an
|
29
|
+
it "should convert new histories to an Set of URIs" do
|
30
30
|
agent = Agent.new
|
31
31
|
previous_history = ['http://www.example.com']
|
32
|
+
expected_history = Set[URI('http://www.example.com')]
|
32
33
|
|
33
34
|
agent.history = previous_history
|
34
35
|
agent.history.should_not == previous_history
|
35
|
-
agent.history.should ==
|
36
|
+
agent.history.should == expected_history
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should be able to restore the failures" do
|
40
|
+
agent = Agent.new
|
41
|
+
previous_failures = Set[URI('http://localhost/')]
|
42
|
+
|
43
|
+
agent.failures = previous_failures
|
44
|
+
agent.failures.should == previous_failures
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should convert new histories to a Set of URIs" do
|
48
|
+
agent = Agent.new
|
49
|
+
previous_failures = ['http://localhost/']
|
50
|
+
expected_failures = Set[URI('http://localhost/')]
|
51
|
+
|
52
|
+
agent.failures = previous_failures
|
53
|
+
agent.failures.should_not == previous_failures
|
54
|
+
agent.failures.should == expected_failures
|
36
55
|
end
|
37
56
|
|
38
57
|
it "should be able to restore the queue" do
|
@@ -46,37 +65,11 @@ describe Agent do
|
|
46
65
|
it "should convert new queues to an Array of URIs" do
|
47
66
|
agent = Agent.new
|
48
67
|
previous_queue = ['http://www.example.com']
|
68
|
+
expected_queue = [URI('http://www.example.com')]
|
49
69
|
|
50
70
|
agent.queue = previous_queue
|
51
71
|
agent.queue.should_not == previous_queue
|
52
|
-
agent.queue.should ==
|
53
|
-
end
|
54
|
-
|
55
|
-
it "should be able to pause spidering" do
|
56
|
-
count = 0
|
57
|
-
agent = Agent.host('spidr.rubyforge.org') do |spider|
|
58
|
-
spider.every_page do |page|
|
59
|
-
count += 1
|
60
|
-
spider.pause! if count >= 2
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
agent.should be_paused
|
65
|
-
agent.history.length.should == 2
|
66
|
-
end
|
67
|
-
|
68
|
-
it "should be able to continue spidering after being paused" do
|
69
|
-
agent = Agent.new do |spider|
|
70
|
-
spider.enqueue('http://spidr.rubyforge.org/')
|
71
|
-
spider.every_page do |page|
|
72
|
-
spider.pause!
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
agent.pause!
|
77
|
-
agent.continue!
|
78
|
-
|
79
|
-
agent.visited?('http://spidr.rubyforge.org/').should == true
|
72
|
+
agent.queue.should == expected_queue
|
80
73
|
end
|
81
74
|
|
82
75
|
it "should provide a to_hash method that returns the queue and history" do
|
data/spec/extensions/uri_spec.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'spidr/extensions/uri'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe URI do
|
6
|
+
describe "expand_path" do
|
7
|
+
it "should preserve single directory paths" do
|
8
|
+
URI.expand_path('path').should == 'path'
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should preserve trailing '/'" do
|
12
|
+
URI.expand_path('test/path/').should == 'test/path/'
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should remove multiple '/' characters" do
|
16
|
+
URI.expand_path('///test///path///').should == '/test/path/'
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should remove '.' directories from the path" do
|
20
|
+
URI.expand_path('test/./path').should == 'test/path'
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should handle '..' directories properly" do
|
24
|
+
URI.expand_path('test/../path').should == 'path'
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should limit the number of '..' directories resolved" do
|
28
|
+
URI.expand_path('/test/../../../..').should == '/'
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should preserve absolute paths" do
|
32
|
+
URI.expand_path('/test/path').should == '/test/path'
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should preserve the root path" do
|
36
|
+
URI.expand_path('/').should == '/'
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/spec/filters_spec.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'spidr/filters'
|
2
|
+
require 'spidr/agent'
|
3
|
+
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
describe Filters do
|
7
|
+
it "should allow setting the acceptable schemes" do
|
8
|
+
agent = Agent.new
|
9
|
+
|
10
|
+
agent.schemes = [:http]
|
11
|
+
agent.schemes.should == ['http']
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should provide the hosts that will be visited" do
|
15
|
+
agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
|
16
|
+
agent.visit_hosts.should == ['spidr.rubyforge.org']
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should provide the hosts that will not be visited" do
|
20
|
+
agent = Agent.new(:ignore_hosts => ['example.com'])
|
21
|
+
agent.ignore_hosts.should == ['example.com']
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should provide the ports that will be visited" do
|
25
|
+
agent = Agent.new(:ports => [80, 443, 8000])
|
26
|
+
agent.visit_ports.should == [80, 443, 8000]
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should provide the ports that will not be visited" do
|
30
|
+
agent = Agent.new(:ignore_ports => [8000, 8080])
|
31
|
+
agent.ignore_ports.should == [8000, 8080]
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should provide the links that will be visited" do
|
35
|
+
agent = Agent.new(:links => ['index.php'])
|
36
|
+
agent.visit_links.should == ['index.php']
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should provide the links that will not be visited" do
|
40
|
+
agent = Agent.new(:ignore_links => [/login/])
|
41
|
+
agent.ignore_links.should == [/login/]
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should provide the exts that will be visited" do
|
45
|
+
agent = Agent.new(:exts => ['htm'])
|
46
|
+
agent.visit_exts.should == ['htm']
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should provide the exts that will not be visited" do
|
50
|
+
agent = Agent.new(:ignore_exts => ['cfm'])
|
51
|
+
agent.ignore_exts.should == ['cfm']
|
52
|
+
end
|
53
|
+
end
|