spidr 0.4.1 → 0.5.0

Files changed (46)
  1. checksums.yaml +7 -0
  2. data/ChangeLog.md +69 -54
  3. data/Gemfile +9 -5
  4. data/LICENSE.txt +1 -1
  5. data/README.md +34 -26
  6. data/Rakefile +4 -15
  7. data/gemspec.yml +3 -2
  8. data/lib/spidr/agent.rb +101 -44
  9. data/lib/spidr/{actions → agent}/actions.rb +32 -12
  10. data/lib/spidr/{events.rb → agent/events.rb} +4 -8
  11. data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
  12. data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
  13. data/lib/spidr/auth_store.rb +2 -2
  14. data/lib/spidr/cookie_jar.rb +2 -2
  15. data/lib/spidr/extensions/uri.rb +28 -16
  16. data/lib/spidr/page.rb +7 -11
  17. data/lib/spidr/{body.rb → page/body.rb} +1 -1
  18. data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
  19. data/lib/spidr/{links.rb → page/links.rb} +43 -7
  20. data/lib/spidr/session_cache.rb +2 -2
  21. data/lib/spidr/spidr.rb +32 -5
  22. data/lib/spidr/version.rb +1 -1
  23. data/spec/agent/actions_spec.rb +60 -0
  24. data/spec/agent/filters_spec.rb +62 -0
  25. data/spec/agent/sanitizers_spec.rb +62 -0
  26. data/spec/agent_spec.rb +13 -13
  27. data/spec/auth_store_spec.rb +17 -17
  28. data/spec/cookie_jar_spec.rb +26 -26
  29. data/spec/extensions/uri_spec.rb +19 -9
  30. data/spec/helpers/history.rb +5 -5
  31. data/spec/helpers/wsoc.rb +2 -2
  32. data/spec/page_examples.rb +4 -4
  33. data/spec/page_spec.rb +28 -25
  34. data/spec/rules_spec.rb +14 -14
  35. data/spec/session_cache.rb +7 -7
  36. data/spec/spidr_spec.rb +10 -10
  37. metadata +37 -51
  38. data/lib/spidr/actions.rb +0 -2
  39. data/lib/spidr/actions/exceptions.rb +0 -4
  40. data/lib/spidr/actions/exceptions/action.rb +0 -9
  41. data/lib/spidr/actions/exceptions/paused.rb +0 -11
  42. data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
  43. data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
  44. data/spec/actions_spec.rb +0 -59
  45. data/spec/filters_spec.rb +0 -61
  46. data/spec/sanitizers_spec.rb +0 -61
data/lib/spidr/spidr.rb
@@ -6,10 +6,10 @@ module Spidr
 
   # Default proxy information.
   DEFAULT_PROXY = {
-    :host => nil,
-    :port => COMMON_PROXY_PORT,
-    :user => nil,
-    :password => nil
+    host: nil,
+    port: COMMON_PROXY_PORT,
+    user: nil,
+    password: nil
   }
 
   #
@@ -44,7 +44,7 @@ module Spidr
   # The new proxy information.
   #
   def Spidr.proxy=(new_proxy)
-    @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
+    @@spidr_proxy = {port: COMMON_PROXY_PORT}.merge(new_proxy)
   end
 
   #
@@ -75,6 +75,30 @@ module Spidr
     @@spidr_user_agent = new_agent
   end
 
+  #
+  # Specifies whether `robots.txt` should be honored globally.
+  #
+  # @return [Boolean]
+  #
+  # @since 0.5.0
+  #
+  def Spidr.robots?
+    @robots
+  end
+
+  #
+  # Enables or disables `robots.txt` globally.
+  #
+  # @param [Boolean] mode
+  #
+  # @return [Boolean]
+  #
+  # @since 0.5.0
+  #
+  def Spidr.robots=(mode)
+    @robots = mode
+  end
+
   #
   # @see Agent.start_at
   #
@@ -95,4 +119,7 @@ module Spidr
   def Spidr.site(url,options={},&block)
     Agent.site(url,options,&block)
   end
+
+  def Spidr.robots
+  end
 end
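The hunks above add a global `robots.txt` switch alongside the existing module-level proxy and user-agent settings. A minimal usage sketch, assuming agents consult the new `Spidr.robots` flag as the accessors suggest (the proxy host, port, and example URL are illustrative, not taken from the diff):

    require 'spidr'

    # Global configuration via the module-level setters shown in the diff.
    Spidr.proxy  = {host: 'proxy.example.com', port: 8080}  # merged over the default :port
    Spidr.robots = true                                     # honor robots.txt for new agents

    Spidr.site('http://spidr.rubyforge.org/') do |spider|
      spider.every_page do |page|
        puts page.url
      end
    end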
data/lib/spidr/version.rb
@@ -1,4 +1,4 @@
 module Spidr
   # Spidr version
-  VERSION = '0.4.1'
+  VERSION = '0.5.0'
 end
data/spec/agent/actions_spec.rb
@@ -0,0 +1,60 @@
+require 'spidr/agent'
+
+require 'spec_helper'
+
+describe Agent do
+  describe "actions" do
+    let(:url) { URI('http://spidr.rubyforge.org/') }
+
+    it "should be able to pause spidering" do
+      count = 0
+      agent = Agent.host('spidr.rubyforge.org') do |spider|
+        spider.every_page do |page|
+          count += 1
+          spider.pause! if count >= 2
+        end
+      end
+
+      expect(agent).to be_paused
+      expect(agent.history.length).to eq(2)
+    end
+
+    it "should be able to continue spidering after being paused" do
+      agent = Agent.new do |spider|
+        spider.every_page do |page|
+          spider.pause!
+        end
+      end
+
+      agent.enqueue(url)
+      agent.continue!
+
+      expect(agent.visited?(url)).to eq(true)
+    end
+
+    it "should allow skipping of enqueued links" do
+      agent = Agent.new do |spider|
+        spider.every_url do |url|
+          spider.skip_link!
+        end
+      end
+
+      agent.enqueue(url)
+
+      expect(agent.queue).to be_empty
+    end
+
+    it "should allow skipping of visited pages" do
+      agent = Agent.new do |spider|
+        spider.every_page do |url|
+          spider.skip_page!
+        end
+      end
+
+      agent.visit_page(url)
+
+      expect(agent.history).to eq(Set[url])
+      expect(agent.queue).to be_empty
+    end
+  end
+end
data/spec/agent/filters_spec.rb
@@ -0,0 +1,62 @@
+require 'spidr/agent'
+
+require 'spec_helper'
+
+describe Agent do
+  describe "filters" do
+    it "should allow setting the acceptable schemes" do
+      agent = Agent.new
+
+      agent.schemes = [:http]
+      expect(agent.schemes).to eq(['http'])
+    end
+
+    it "should provide the hosts that will be visited" do
+      agent = Agent.new(hosts: ['spidr.rubyforge.org'])
+
+      expect(agent.visit_hosts).to eq(['spidr.rubyforge.org'])
+    end
+
+    it "should provide the hosts that will not be visited" do
+      agent = Agent.new(ignore_hosts: ['example.com'])
+
+      expect(agent.ignore_hosts).to eq(['example.com'])
+    end
+
+    it "should provide the ports that will be visited" do
+      agent = Agent.new(ports: [80, 443, 8000])
+
+      expect(agent.visit_ports).to eq([80, 443, 8000])
+    end
+
+    it "should provide the ports that will not be visited" do
+      agent = Agent.new(ignore_ports: [8000, 8080])
+
+      expect(agent.ignore_ports).to eq([8000, 8080])
+    end
+
+    it "should provide the links that will be visited" do
+      agent = Agent.new(links: ['index.php'])
+
+      expect(agent.visit_links).to eq(['index.php'])
+    end
+
+    it "should provide the links that will not be visited" do
+      agent = Agent.new(ignore_links: [/login/])
+
+      expect(agent.ignore_links).to eq([/login/])
+    end
+
+    it "should provide the exts that will be visited" do
+      agent = Agent.new(exts: ['htm'])
+
+      expect(agent.visit_exts).to eq(['htm'])
+    end
+
+    it "should provide the exts that will not be visited" do
+      agent = Agent.new(ignore_exts: ['cfm'])
+
+      expect(agent.ignore_exts).to eq(['cfm'])
+    end
+  end
+end
data/spec/agent/sanitizers_spec.rb
@@ -0,0 +1,62 @@
+require 'spidr/agent'
+
+require 'spec_helper'
+
+describe Agent do
+  describe "sanitizers" do
+    describe "sanitize_url" do
+      let(:url) { 'http://host.com' }
+      before(:all) { @agent = Agent.new }
+
+      it "should sanitize URLs" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(URI(url))
+
+        expect(clean_url.host).to eq('host.com')
+      end
+
+      it "should sanitize URLs given as Strings" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.host).to eq('host.com')
+      end
+    end
+
+    describe "strip_fragments" do
+      let(:url) { URI("http://host.com/page#lol") }
+
+      it "should strip fragment components by default" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.fragment).to be_nil
+      end
+
+      it "should allow perserving fragment components" do
+        agent = Agent.new(strip_fragments: false)
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.fragment).to eq('lol')
+      end
+    end
+
+    describe "strip_query" do
+      let(:url) { URI("http://host.com/page?x=1") }
+
+      it "should not strip query components by default" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.query).to eq('x=1')
+      end
+
+      it "should allow stripping of query components" do
+        agent = Agent.new(strip_query: true)
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.query).to be_nil
+      end
+    end
+  end
+end
data/spec/agent_spec.rb
@@ -11,11 +11,11 @@ describe Agent do
   end
 
   it "should provide the history" do
-    @agent.history.should_not be_empty
+    expect(@agent.history).not_to be_empty
   end
 
   it "should provide the queue" do
-    @agent.queue.should be_empty
+    expect(@agent.queue).to be_empty
   end
 
   it "should be able to restore the history" do
@@ -23,7 +23,7 @@ describe Agent do
     previous_history = Set[URI('http://www.example.com')]
 
     agent.history = previous_history
-    agent.history.should == previous_history
+    expect(agent.history).to eq(previous_history)
   end
 
   it "should convert new histories to an Set of URIs" do
@@ -32,8 +32,8 @@ describe Agent do
     expected_history = Set[URI('http://www.example.com')]
 
     agent.history = previous_history
-    agent.history.should_not == previous_history
-    agent.history.should == expected_history
+    expect(agent.history).not_to eq(previous_history)
+    expect(agent.history).to eq(expected_history)
   end
 
   it "should be able to restore the failures" do
@@ -41,7 +41,7 @@ describe Agent do
     previous_failures = Set[URI('http://localhost/')]
 
     agent.failures = previous_failures
-    agent.failures.should == previous_failures
+    expect(agent.failures).to eq(previous_failures)
   end
 
   it "should convert new histories to a Set of URIs" do
@@ -50,8 +50,8 @@ describe Agent do
     expected_failures = Set[URI('http://localhost/')]
 
     agent.failures = previous_failures
-    agent.failures.should_not == previous_failures
-    agent.failures.should == expected_failures
+    expect(agent.failures).not_to eq(previous_failures)
+    expect(agent.failures).to eq(expected_failures)
   end
 
   it "should be able to restore the queue" do
@@ -59,7 +59,7 @@ describe Agent do
     previous_queue = [URI('http://www.example.com')]
 
     agent.queue = previous_queue
-    agent.queue.should == previous_queue
+    expect(agent.queue).to eq(previous_queue)
   end
 
   it "should convert new queues to an Array of URIs" do
@@ -68,14 +68,14 @@ describe Agent do
     expected_queue = [URI('http://www.example.com')]
 
     agent.queue = previous_queue
-    agent.queue.should_not == previous_queue
-    agent.queue.should == expected_queue
+    expect(agent.queue).not_to eq(previous_queue)
+    expect(agent.queue).to eq(expected_queue)
   end
 
   it "should provide a to_hash method that returns the queue and history" do
     hash = @agent.to_hash
 
-    hash[:queue].should be_empty
-    hash[:history].should_not be_empty
+    expect(hash[:queue]).to be_empty
+    expect(hash[:history]).not_to be_empty
   end
 end
data/spec/auth_store_spec.rb
@@ -17,25 +17,25 @@ describe AuthStore do
 
   it 'should retrieve auth credentials for the URL' do
     @auth_store[root_uri] = AuthCredential.new('user1', 'pass1')
-    @auth_store[root_uri].username.should == 'user1'
-    @auth_store[root_uri].password.should == 'pass1'
+    expect(@auth_store[root_uri].username).to eq('user1')
+    expect(@auth_store[root_uri].password).to eq('pass1')
   end
 
   it 'should add auth credentials for the URL' do
-    lambda {
+    expect {
       @auth_store.add(root_uri, 'user1', 'pass1')
-    }.should change(@auth_store, :size)
+    }.to change(@auth_store, :size)
 
-    @auth_store[root_uri].username.should == 'user1'
-    @auth_store[root_uri].password.should == 'pass1'
+    expect(@auth_store[root_uri].username).to eq('user1')
+    expect(@auth_store[root_uri].password).to eq('pass1')
   end
 
   describe 'matching' do
     let(:sub_uri) { uri.merge('/course/auth/protected.html') }
 
     it 'should match a longer URL to the base' do
-      @auth_store[sub_uri].username.should == 'admin'
-      @auth_store[sub_uri].password.should == 'password'
+      expect(@auth_store[sub_uri].username).to eq('admin')
+      expect(@auth_store[sub_uri].password).to eq('password')
     end
 
     it 'should match the longest of all matching URLs' do
@@ -44,42 +44,42 @@ describe AuthStore do
       @auth_store.add(uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
 
      auth = @auth_store[uri.merge('/course/auth/special/1.html')]
-      auth.username.should == 'user2'
-      auth.password.should == 'pass2'
+      expect(auth.username).to eq('user2')
+      expect(auth.password).to eq('pass2')
    end
 
     it 'should not match a URL with a different host' do
      remote_uri = URI('http://spidr.rubyforge.org/course/auth')
 
-      @auth_store[remote_uri].should be_nil
+      expect(@auth_store[remote_uri]).to be_nil
    end
 
     it 'should not match a URL with an alternate path' do
      relative_uri = uri.merge('/course/admin/protected.html')
 
-      @auth_store[relative_uri].should be_nil
+      expect(@auth_store[relative_uri]).to be_nil
    end
  end
 
   it 'should override previous auth credentials' do
    @auth_store.add(uri, 'newuser', 'newpass')
 
-    @auth_store[uri].username.should == 'newuser'
-    @auth_store[uri].password.should == 'newpass'
+    expect(@auth_store[uri].username).to eq('newuser')
+    expect(@auth_store[uri].password).to eq('newpass')
  end
 
   it 'should clear all cookies' do
    @auth_store.clear!
-    @auth_store.size.should == 0
+    expect(@auth_store.size).to eq(0)
  end
 
   describe 'for_url' do
    it 'should return nil if no authorization exists' do
-      @auth_store.for_url(URI('http://php.net')).should be_nil
+      expect(@auth_store.for_url(URI('http://php.net'))).to be_nil
    end
 
    it 'should create an encoded authorization string' do
-      @auth_store.for_url(uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
+      expect(@auth_store.for_url(uri)).to eq("YWRtaW46cGFzc3dvcmQ=\n")
    end
  end
 end
data/spec/cookie_jar_spec.rb
@@ -6,39 +6,39 @@ describe CookieJar do
   it "should retrieve cookies for the named host" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
 
-    subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
+    expect(subject['zerosum.org']).to eq({'admin' => 'ofcourseiam'})
   end
 
   it "should add a cookie to the jar" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
 
-    subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
+    expect(subject['zerosum.org']).to eq({'admin' => 'ofcourseiam'})
   end
 
   it "should merge new cookies into the jar" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
     subject['zerosum.org'] = {'other' => '1'}
 
-    subject['zerosum.org'].should == {
+    expect(subject['zerosum.org']).to eq({
       'admin' => 'ofcourseiam',
       'other' => '1'
-    }
+    })
   end
 
   it "should override previous cookies in the jar" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
     subject['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
 
-    subject['zerosum.org'].should == {
+    expect(subject['zerosum.org']).to eq({
       'admin' => 'somethingcompletelydifferent'
-    }
+    })
   end
 
   it "should clear all cookies" do
     subject['zerosum.org'] = {'cookie' => 'foobar'}
     subject.clear!
 
-    subject.size.should == 0
+    expect(subject.size).to eq(0)
   end
 
   describe "dirty" do
@@ -48,37 +48,37 @@ describe CookieJar do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
       subject['zerosum.org'] = {'other' => '1'}
 
-      dirty.include?('zerosum.org').should == true
+      expect(dirty.include?('zerosum.org')).to eq(true)
     end
 
     it "should mark a cookie dirty after overriding params" do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
      subject['zerosum.org'] = {'admin' => 'nope'}
 
-      dirty.include?('zerosum.org').should == true
+      expect(dirty.include?('zerosum.org')).to eq(true)
    end
 
     it "should un-mark a cookie as dirty after re-encoding it" do
      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
      subject['zerosum.org'] = {'admin' => 'nope'}
 
-      dirty.include?('zerosum.org').should == true
+      expect(dirty.include?('zerosum.org')).to eq(true)
 
      subject.for_host('zerosum.org')
 
-      dirty.include?('zerosum.org').should == false
+      expect(dirty.include?('zerosum.org')).to eq(false)
    end
  end
 
   describe "cookies_for_host" do
    it "should return an empty Hash for unknown hosts" do
-      subject.cookies_for_host('lol.com').should be_empty
+      expect(subject.cookies_for_host('lol.com')).to be_empty
    end
 
    it "should return an empty Hash for hosts with no cookie params" do
      subject['lol.com'] = {}
 
-      subject.cookies_for_host('lol.com').should be_empty
+      expect(subject.cookies_for_host('lol.com')).to be_empty
    end
 
    it "should return cookie parameters for the host" do
@@ -87,8 +87,8 @@ describe CookieJar do
 
      cookie = subject.cookies_for_host('zerosum.org')
 
-      cookie['admin'].should == 'ofcourseiam'
-      cookie['other'].should == '1'
+      expect(cookie['admin']).to eq('ofcourseiam')
+      expect(cookie['other']).to eq('1')
    end
 
    it "should include cookies for the parent domain" do
@@ -97,26 +97,26 @@ describe CookieJar do
 
      cookie = subject.cookies_for_host('sub.zerosum.org')
 
-      cookie['admin'].should == 'ofcourseiam'
-      cookie['other'].should == '1'
+      expect(cookie['admin']).to eq('ofcourseiam')
+      expect(cookie['other']).to eq('1')
    end
  end
 
   describe "for_host" do
    it "should return nil for unknown hosts" do
-      subject.for_host('lol.com').should be_nil
+      expect(subject.for_host('lol.com')).to be_nil
    end
 
    it "should return nil for hosts with no cookie params" do
      subject['lol.com'] = {}
 
-      subject.for_host('lol.com').should be_nil
+      expect(subject.for_host('lol.com')).to be_nil
    end
 
    it "should encode single cookie params" do
      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
 
-      subject.for_host('zerosum.org').should == 'admin=ofcourseiam'
+      expect(subject.for_host('zerosum.org')).to eq('admin=ofcourseiam')
    end
 
    it "should encode multiple cookie params" do
@@ -125,9 +125,9 @@ describe CookieJar do
 
      cookie = subject.for_host('zerosum.org')
 
-      cookie.should include('admin=ofcourseiam')
-      cookie.should include('; ')
-      cookie.should include('other=1')
+      expect(cookie).to include('admin=ofcourseiam')
+      expect(cookie).to include('; ')
+      expect(cookie).to include('other=1')
    end
 
    it "should include cookies for the parent domain" do
@@ -136,9 +136,9 @@ describe CookieJar do
 
      cookie = subject.for_host('sub.zerosum.org')
 
-      cookie.should include('admin=ofcourseiam')
-      cookie.should include('; ')
-      cookie.should include('other=1')
+      expect(cookie).to include('admin=ofcourseiam')
+      expect(cookie).to include('; ')
+      expect(cookie).to include('other=1')
    end
  end
 end