spidr 0.1.9 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +43 -0
- data/Manifest.txt +19 -0
- data/README.txt +100 -11
- data/Rakefile +15 -5
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/actions/actions.rb +79 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +6 -0
- data/lib/spidr/actions/exceptions/paused.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
- data/lib/spidr/agent.rb +385 -444
- data/lib/spidr/events.rb +87 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/extensions/uri.rb +45 -0
- data/lib/spidr/filters.rb +438 -0
- data/lib/spidr/page.rb +211 -70
- data/lib/spidr/rules.rb +40 -18
- data/lib/spidr/spidr.rb +57 -7
- data/lib/spidr/version.rb +2 -1
- data/spec/actions_spec.rb +61 -0
- data/spec/agent_spec.rb +24 -31
- data/spec/extensions/uri_spec.rb +39 -0
- data/spec/filters_spec.rb +53 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/page_examples.rb +17 -0
- data/spec/page_spec.rb +81 -0
- data/spec/rules_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/spidr_spec.rb +30 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +8 -1
- data/tasks/spec.rb +1 -0
- data/tasks/yard.rb +12 -0
- metadata +45 -6
- metadata.gz.sig +0 -0
data/lib/spidr/rules.rb
CHANGED
@@ -7,25 +7,43 @@ module Spidr
|
|
7
7
|
# Reject rules
|
8
8
|
attr_reader :reject
|
9
9
|
|
10
|
+
#
|
11
|
+
# Creates a new Rules object.
|
12
|
+
#
|
13
|
+
# @param [Hash] options
|
14
|
+
# Additional options.
|
15
|
+
#
|
16
|
+
# @option options [Array<String, Regexp, Proc>] :accept
|
17
|
+
# The patterns to accept data with.
|
18
|
+
#
|
19
|
+
# @option options [Array<String, Regexp, Proc>] :reject
|
20
|
+
# The patterns to reject data with.
|
21
|
+
#
|
10
22
|
def initialize(options={})
|
11
|
-
@accept =
|
12
|
-
@reject =
|
23
|
+
@accept = []
|
24
|
+
@reject = []
|
25
|
+
|
26
|
+
@accept += options[:accept] if options[:accept]
|
27
|
+
@reject += options[:reject] if options[:reject]
|
13
28
|
end
|
14
29
|
|
15
30
|
#
|
16
|
-
#
|
17
|
-
#
|
31
|
+
# Determines whether the data should be accepted or rejected.
|
32
|
+
#
|
33
|
+
# @return [Boolean]
|
34
|
+
# Specifies whether the given data was accepted, using the rules
|
35
|
+
# acceptance patterns.
|
18
36
|
#
|
19
|
-
def accept?(
|
37
|
+
def accept?(data)
|
20
38
|
unless @accept.empty?
|
21
39
|
@accept.each do |rule|
|
22
|
-
return true if
|
40
|
+
return true if test_data(data,rule)
|
23
41
|
end
|
24
42
|
|
25
43
|
return false
|
26
44
|
else
|
27
45
|
@reject.each do |rule|
|
28
|
-
return false if
|
46
|
+
return false if test_data(data,rule)
|
29
47
|
end
|
30
48
|
|
31
49
|
return true
|
@@ -33,27 +51,31 @@ module Spidr
|
|
33
51
|
end
|
34
52
|
|
35
53
|
#
|
36
|
-
#
|
37
|
-
# returns +false+ otherwise.
|
54
|
+
# Determines whether the data should be rejected or accepted.
|
38
55
|
#
|
39
|
-
|
40
|
-
|
56
|
+
# @return [Boolean]
|
57
|
+
# Specifies whether the given data was rejected, using the rules
|
58
|
+
# rejection patterns.
|
59
|
+
#
|
60
|
+
def reject?(data)
|
61
|
+
!(accept?(data))
|
41
62
|
end
|
42
63
|
|
43
64
|
protected
|
44
65
|
|
45
66
|
#
|
46
|
-
# Tests the
|
47
|
-
#
|
48
|
-
#
|
67
|
+
# Tests the given data against a given pattern.
|
68
|
+
#
|
69
|
+
# @return [Boolean]
|
70
|
+
# Specifies whether the given data matched the pattern.
|
49
71
|
#
|
50
|
-
def
|
72
|
+
def test_data(data,rule)
|
51
73
|
if rule.kind_of?(Proc)
|
52
|
-
return (rule.call(
|
74
|
+
return (rule.call(data) == true)
|
53
75
|
elsif rule.kind_of?(Regexp)
|
54
|
-
return !((
|
76
|
+
return !((data.to_s =~ rule).nil?)
|
55
77
|
else
|
56
|
-
return
|
78
|
+
return data == rule
|
57
79
|
end
|
58
80
|
end
|
59
81
|
|
data/lib/spidr/spidr.rb
CHANGED
@@ -4,43 +4,93 @@ module Spidr
|
|
4
4
|
# Common proxy port.
|
5
5
|
COMMON_PROXY_PORT = 8080
|
6
6
|
|
7
|
+
# Default proxy information.
|
8
|
+
DEFAULT_PROXY = {
|
9
|
+
:host => nil,
|
10
|
+
:port => COMMON_PROXY_PORT,
|
11
|
+
:user => nil,
|
12
|
+
:password => nil
|
13
|
+
}
|
14
|
+
|
15
|
+
#
|
16
|
+
# Proxy information used by all newly created Agent objects by default.
|
7
17
|
#
|
8
|
-
#
|
18
|
+
# @return [Hash]
|
19
|
+
# The Spidr proxy information.
|
9
20
|
#
|
10
21
|
def Spidr.proxy
|
11
|
-
@@spidr_proxy ||=
|
22
|
+
@@spidr_proxy ||= DEFAULT_PROXY
|
23
|
+
end
|
24
|
+
|
25
|
+
#
|
26
|
+
# Sets the proxy information used by Agent objects.
|
27
|
+
#
|
28
|
+
# @param [Hash] new_proxy
|
29
|
+
# The new proxy information.
|
30
|
+
#
|
31
|
+
# @option new_proxy [String] :host
|
32
|
+
# The host-name of the proxy.
|
33
|
+
#
|
34
|
+
# @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
|
35
|
+
# The port of the proxy.
|
36
|
+
#
|
37
|
+
# @option new_proxy [String] :user
|
38
|
+
# The user to authenticate with the proxy as.
|
39
|
+
#
|
40
|
+
# @option new_proxy [String] :password
|
41
|
+
# The password to authenticate with the proxy.
|
42
|
+
#
|
43
|
+
# @return [Hash]
|
44
|
+
# The new proxy information.
|
45
|
+
#
|
46
|
+
def Spidr.proxy=(new_proxy)
|
47
|
+
@@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
|
12
48
|
end
|
13
49
|
|
14
50
|
#
|
15
|
-
#
|
51
|
+
# Disables the proxy settings used by all newly created Agent objects.
|
52
|
+
#
|
53
|
+
def Spidr.disable_proxy!
|
54
|
+
@@spidr_proxy = DEFAULT_PROXY
|
55
|
+
return true
|
56
|
+
end
|
57
|
+
|
58
|
+
#
|
59
|
+
# The User-Agent string used by all Agent objects by default.
|
60
|
+
#
|
61
|
+
# @return [String]
|
62
|
+
# The Spidr User-Agent string.
|
16
63
|
#
|
17
64
|
def Spidr.user_agent
|
18
65
|
@@spidr_user_agent ||= nil
|
19
66
|
end
|
20
67
|
|
21
68
|
#
|
22
|
-
# Sets the Spidr
|
69
|
+
# Sets the Spidr User-Agent string.
|
70
|
+
#
|
71
|
+
# @param [String] new_agent
|
72
|
+
# The new User-Agent string.
|
23
73
|
#
|
24
74
|
def Spidr.user_agent=(new_agent)
|
25
75
|
@@spidr_user_agent = new_agent
|
26
76
|
end
|
27
77
|
|
28
78
|
#
|
29
|
-
#
|
79
|
+
# @see Agent.start_at
|
30
80
|
#
|
31
81
|
def Spidr.start_at(url,options={},&block)
|
32
82
|
Agent.start_at(url,options,&block)
|
33
83
|
end
|
34
84
|
|
35
85
|
#
|
36
|
-
#
|
86
|
+
# @see Agent.host
|
37
87
|
#
|
38
88
|
def Spidr.host(name,options={},&block)
|
39
89
|
Agent.host(name,options,&block)
|
40
90
|
end
|
41
91
|
|
42
92
|
#
|
43
|
-
#
|
93
|
+
# @see Agent.site
|
44
94
|
#
|
45
95
|
def Spidr.site(url,options={},&block)
|
46
96
|
Agent.site(url,options,&block)
|
data/lib/spidr/version.rb
CHANGED
data/spec/actions_spec.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'spidr/actions'
|
2
|
+
require 'spidr/agent'
|
3
|
+
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
describe Actions do
|
7
|
+
before(:all) do
|
8
|
+
@url = URI('http://spidr.rubyforge.org/')
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should be able to pause spidering" do
|
12
|
+
count = 0
|
13
|
+
agent = Agent.host('spidr.rubyforge.org') do |spider|
|
14
|
+
spider.every_page do |page|
|
15
|
+
count += 1
|
16
|
+
spider.pause! if count >= 2
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
agent.should be_paused
|
21
|
+
agent.history.length.should == 2
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should be able to continue spidering after being paused" do
|
25
|
+
agent = Agent.new do |spider|
|
26
|
+
spider.every_page do |page|
|
27
|
+
spider.pause!
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
agent.enqueue(@url)
|
32
|
+
agent.continue!
|
33
|
+
|
34
|
+
agent.visited?(@url).should == true
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should allow skipping of enqueued links" do
|
38
|
+
agent = Agent.new do |spider|
|
39
|
+
spider.every_url do |url|
|
40
|
+
spider.skip_link!
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
agent.enqueue(@url)
|
45
|
+
|
46
|
+
agent.queue.should be_empty
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should allow skipping of visited pages" do
|
50
|
+
agent = Agent.new do |spider|
|
51
|
+
spider.every_page do |url|
|
52
|
+
spider.skip_page!
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
agent.visit_page(@url)
|
57
|
+
|
58
|
+
agent.history.should == Set[@url]
|
59
|
+
agent.queue.should be_empty
|
60
|
+
end
|
61
|
+
end
|
data/spec/agent_spec.rb
CHANGED
@@ -20,19 +20,38 @@ describe Agent do
|
|
20
20
|
|
21
21
|
it "should be able to restore the history" do
|
22
22
|
agent = Agent.new
|
23
|
-
previous_history = [URI('http://www.example.com')]
|
23
|
+
previous_history = Set[URI('http://www.example.com')]
|
24
24
|
|
25
25
|
agent.history = previous_history
|
26
26
|
agent.history.should == previous_history
|
27
27
|
end
|
28
28
|
|
29
|
-
it "should convert new histories to an
|
29
|
+
it "should convert new histories to an Set of URIs" do
|
30
30
|
agent = Agent.new
|
31
31
|
previous_history = ['http://www.example.com']
|
32
|
+
expected_history = Set[URI('http://www.example.com')]
|
32
33
|
|
33
34
|
agent.history = previous_history
|
34
35
|
agent.history.should_not == previous_history
|
35
|
-
agent.history.should ==
|
36
|
+
agent.history.should == expected_history
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should be able to restore the failures" do
|
40
|
+
agent = Agent.new
|
41
|
+
previous_failures = Set[URI('http://localhost/')]
|
42
|
+
|
43
|
+
agent.failures = previous_failures
|
44
|
+
agent.failures.should == previous_failures
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should convert new histories to a Set of URIs" do
|
48
|
+
agent = Agent.new
|
49
|
+
previous_failures = ['http://localhost/']
|
50
|
+
expected_failures = Set[URI('http://localhost/')]
|
51
|
+
|
52
|
+
agent.failures = previous_failures
|
53
|
+
agent.failures.should_not == previous_failures
|
54
|
+
agent.failures.should == expected_failures
|
36
55
|
end
|
37
56
|
|
38
57
|
it "should be able to restore the queue" do
|
@@ -46,37 +65,11 @@ describe Agent do
|
|
46
65
|
it "should convert new queues to an Array of URIs" do
|
47
66
|
agent = Agent.new
|
48
67
|
previous_queue = ['http://www.example.com']
|
68
|
+
expected_queue = [URI('http://www.example.com')]
|
49
69
|
|
50
70
|
agent.queue = previous_queue
|
51
71
|
agent.queue.should_not == previous_queue
|
52
|
-
agent.queue.should ==
|
53
|
-
end
|
54
|
-
|
55
|
-
it "should be able to pause spidering" do
|
56
|
-
count = 0
|
57
|
-
agent = Agent.host('spidr.rubyforge.org') do |spider|
|
58
|
-
spider.every_page do |page|
|
59
|
-
count += 1
|
60
|
-
spider.pause! if count >= 2
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
agent.should be_paused
|
65
|
-
agent.history.length.should == 2
|
66
|
-
end
|
67
|
-
|
68
|
-
it "should be able to continue spidering after being paused" do
|
69
|
-
agent = Agent.new do |spider|
|
70
|
-
spider.enqueue('http://spidr.rubyforge.org/')
|
71
|
-
spider.every_page do |page|
|
72
|
-
spider.pause!
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
agent.pause!
|
77
|
-
agent.continue!
|
78
|
-
|
79
|
-
agent.visited?('http://spidr.rubyforge.org/').should == true
|
72
|
+
agent.queue.should == expected_queue
|
80
73
|
end
|
81
74
|
|
82
75
|
it "should provide a to_hash method that returns the queue and history" do
|
data/spec/extensions/uri_spec.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'spidr/extensions/uri'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe URI do
|
6
|
+
describe "expand_path" do
|
7
|
+
it "should preserve single directory paths" do
|
8
|
+
URI.expand_path('path').should == 'path'
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should preserve trailing '/'" do
|
12
|
+
URI.expand_path('test/path/').should == 'test/path/'
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should remove multiple '/' characters" do
|
16
|
+
URI.expand_path('///test///path///').should == '/test/path/'
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should remove '.' directories from the path" do
|
20
|
+
URI.expand_path('test/./path').should == 'test/path'
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should handle '..' directories properly" do
|
24
|
+
URI.expand_path('test/../path').should == 'path'
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should limit the number of '..' directories resolved" do
|
28
|
+
URI.expand_path('/test/../../../..').should == '/'
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should preserve absolute paths" do
|
32
|
+
URI.expand_path('/test/path').should == '/test/path'
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should preserve the root path" do
|
36
|
+
URI.expand_path('/').should == '/'
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/spec/filters_spec.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'spidr/filters'
|
2
|
+
require 'spidr/agent'
|
3
|
+
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
describe Filters do
|
7
|
+
it "should allow setting the acceptable schemes" do
|
8
|
+
agent = Agent.new
|
9
|
+
|
10
|
+
agent.schemes = [:http]
|
11
|
+
agent.schemes.should == ['http']
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should provide the hosts that will be visited" do
|
15
|
+
agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
|
16
|
+
agent.visit_hosts.should == ['spidr.rubyforge.org']
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should provide the hosts that will not be visited" do
|
20
|
+
agent = Agent.new(:ignore_hosts => ['example.com'])
|
21
|
+
agent.ignore_hosts.should == ['example.com']
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should provide the ports that will be visited" do
|
25
|
+
agent = Agent.new(:ports => [80, 443, 8000])
|
26
|
+
agent.visit_ports.should == [80, 443, 8000]
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should provide the ports that will not be visited" do
|
30
|
+
agent = Agent.new(:ignore_ports => [8000, 8080])
|
31
|
+
agent.ignore_ports.should == [8000, 8080]
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should provide the links that will be visited" do
|
35
|
+
agent = Agent.new(:links => ['index.php'])
|
36
|
+
agent.visit_links.should == ['index.php']
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should provide the links that will not be visited" do
|
40
|
+
agent = Agent.new(:ignore_links => [/login/])
|
41
|
+
agent.ignore_links.should == [/login/]
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should provide the exts that will be visited" do
|
45
|
+
agent = Agent.new(:exts => ['htm'])
|
46
|
+
agent.visit_exts.should == ['htm']
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should provide the exts that will not be visited" do
|
50
|
+
agent = Agent.new(:ignore_exts => ['cfm'])
|
51
|
+
agent.ignore_exts.should == ['cfm']
|
52
|
+
end
|
53
|
+
end
|