spidr 0.4.1 → 0.5.0

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries, and is provided for informational purposes only.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/ChangeLog.md +69 -54
  3. data/Gemfile +9 -5
  4. data/LICENSE.txt +1 -1
  5. data/README.md +34 -26
  6. data/Rakefile +4 -15
  7. data/gemspec.yml +3 -2
  8. data/lib/spidr/agent.rb +101 -44
  9. data/lib/spidr/{actions → agent}/actions.rb +32 -12
  10. data/lib/spidr/{events.rb → agent/events.rb} +4 -8
  11. data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
  12. data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
  13. data/lib/spidr/auth_store.rb +2 -2
  14. data/lib/spidr/cookie_jar.rb +2 -2
  15. data/lib/spidr/extensions/uri.rb +28 -16
  16. data/lib/spidr/page.rb +7 -11
  17. data/lib/spidr/{body.rb → page/body.rb} +1 -1
  18. data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
  19. data/lib/spidr/{links.rb → page/links.rb} +43 -7
  20. data/lib/spidr/session_cache.rb +2 -2
  21. data/lib/spidr/spidr.rb +32 -5
  22. data/lib/spidr/version.rb +1 -1
  23. data/spec/agent/actions_spec.rb +60 -0
  24. data/spec/agent/filters_spec.rb +62 -0
  25. data/spec/agent/sanitizers_spec.rb +62 -0
  26. data/spec/agent_spec.rb +13 -13
  27. data/spec/auth_store_spec.rb +17 -17
  28. data/spec/cookie_jar_spec.rb +26 -26
  29. data/spec/extensions/uri_spec.rb +19 -9
  30. data/spec/helpers/history.rb +5 -5
  31. data/spec/helpers/wsoc.rb +2 -2
  32. data/spec/page_examples.rb +4 -4
  33. data/spec/page_spec.rb +28 -25
  34. data/spec/rules_spec.rb +14 -14
  35. data/spec/session_cache.rb +7 -7
  36. data/spec/spidr_spec.rb +10 -10
  37. metadata +37 -51
  38. data/lib/spidr/actions.rb +0 -2
  39. data/lib/spidr/actions/exceptions.rb +0 -4
  40. data/lib/spidr/actions/exceptions/action.rb +0 -9
  41. data/lib/spidr/actions/exceptions/paused.rb +0 -11
  42. data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
  43. data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
  44. data/spec/actions_spec.rb +0 -59
  45. data/spec/filters_spec.rb +0 -61
  46. data/spec/sanitizers_spec.rb +0 -61
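The theme of this release is reorganization: the Actions, Events, Filters, and Sanitizers mixins moved from top-level files into lib/spidr/agent/, the Body, Headers, and Links mixins moved into lib/spidr/page/, and the superseded files (items 38-46 above, including the separate Actions exception files) were deleted. The hunks below also show the two other notable changes: a new global robots.txt toggle in spidr.rb and a migration of the specs to RSpec 3's expect syntax. A hedged sketch of what the reorganization means for callers, assuming code previously required the mixin files directly:

    # Before (0.4.x), the Agent mixins could be required individually:
    # require 'spidr/actions'

    # In 0.5.0 those files are gone; requiring the agent loads its mixins
    # from lib/spidr/agent/:
    require 'spidr/agent'

    agent = Spidr::Agent.new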
data/lib/spidr/spidr.rb
@@ -6,10 +6,10 @@ module Spidr
 
   # Default proxy information.
   DEFAULT_PROXY = {
-    :host => nil,
-    :port => COMMON_PROXY_PORT,
-    :user => nil,
-    :password => nil
+    host: nil,
+    port: COMMON_PROXY_PORT,
+    user: nil,
+    password: nil
   }
 
   #
@@ -44,7 +44,7 @@ module Spidr
   # The new proxy information.
   #
   def Spidr.proxy=(new_proxy)
-    @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
+    @@spidr_proxy = {port: COMMON_PROXY_PORT}.merge(new_proxy)
   end
 
   #
@@ -75,6 +75,30 @@ module Spidr
     @@spidr_user_agent = new_agent
   end
 
+  #
+  # Specifies whether `robots.txt` should be honored globally.
+  #
+  # @return [Boolean]
+  #
+  # @since 0.5.0
+  #
+  def Spidr.robots?
+    @robots
+  end
+
+  #
+  # Enables or disables `robots.txt` globally.
+  #
+  # @param [Boolean] mode
+  #
+  # @return [Boolean]
+  #
+  # @since 0.5.0
+  #
+  def Spidr.robots=(mode)
+    @robots = mode
+  end
+
   #
   # @see Agent.start_at
   #
@@ -95,4 +119,7 @@ module Spidr
   def Spidr.site(url,options={},&block)
     Agent.site(url,options,&block)
   end
+
+  def Spidr.robots
+  end
 end
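Beyond the hash-syntax modernization, the additions above introduce a global robots.txt switch. A minimal usage sketch (the proxy host and crawl URL are placeholders, and it assumes agents created after setting Spidr.robots = true consult the flag):

    require 'spidr'

    # Honor robots.txt for subsequently created agents (new in 0.5.0).
    Spidr.robots = true

    # Spidr.proxy= merges over the default port, so :port may be omitted;
    # note the Ruby 1.9 hash syntax now used throughout the gem.
    Spidr.proxy = {host: 'proxy.example.com', port: 8080}

    Spidr.site('http://example.com/') do |spider|
      spider.every_page do |page|
        puts page.url
      end
    end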
data/lib/spidr/version.rb
@@ -1,4 +1,4 @@
 module Spidr
   # Spidr version
-  VERSION = '0.4.1'
+  VERSION = '0.5.0'
 end
data/spec/agent/actions_spec.rb (new file)
@@ -0,0 +1,60 @@
+require 'spidr/agent'
+
+require 'spec_helper'
+
+describe Agent do
+  describe "actions" do
+    let(:url) { URI('http://spidr.rubyforge.org/') }
+
+    it "should be able to pause spidering" do
+      count = 0
+      agent = Agent.host('spidr.rubyforge.org') do |spider|
+        spider.every_page do |page|
+          count += 1
+          spider.pause! if count >= 2
+        end
+      end
+
+      expect(agent).to be_paused
+      expect(agent.history.length).to eq(2)
+    end
+
+    it "should be able to continue spidering after being paused" do
+      agent = Agent.new do |spider|
+        spider.every_page do |page|
+          spider.pause!
+        end
+      end
+
+      agent.enqueue(url)
+      agent.continue!
+
+      expect(agent.visited?(url)).to eq(true)
+    end
+
+    it "should allow skipping of enqueued links" do
+      agent = Agent.new do |spider|
+        spider.every_url do |url|
+          spider.skip_link!
+        end
+      end
+
+      agent.enqueue(url)
+
+      expect(agent.queue).to be_empty
+    end
+
+    it "should allow skipping of visited pages" do
+      agent = Agent.new do |spider|
+        spider.every_page do |url|
+          spider.skip_page!
+        end
+      end
+
+      agent.visit_page(url)
+
+      expect(agent.history).to eq(Set[url])
+      expect(agent.queue).to be_empty
+    end
+  end
+end
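These examples, moved here from spec/actions_spec.rb, document the four control-flow actions: pause!, continue!, skip_link!, and skip_page!. A sketch of how a crawl might use them outside the test suite (the host and the ten-page cutoff are arbitrary):

    require 'spidr/agent'

    # Stop fetching after ten pages; the queue and history stay intact.
    agent = Spidr::Agent.host('spidr.rubyforge.org') do |spider|
      spider.every_page do |page|
        spider.pause! if spider.history.length >= 10
      end
    end

    # Later, resume exactly where the crawl left off.
    agent.continue! if agent.paused?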
data/spec/agent/filters_spec.rb (new file)
@@ -0,0 +1,62 @@
+require 'spidr/agent'
+
+require 'spec_helper'
+
+describe Agent do
+  describe "filters" do
+    it "should allow setting the acceptable schemes" do
+      agent = Agent.new
+
+      agent.schemes = [:http]
+      expect(agent.schemes).to eq(['http'])
+    end
+
+    it "should provide the hosts that will be visited" do
+      agent = Agent.new(hosts: ['spidr.rubyforge.org'])
+
+      expect(agent.visit_hosts).to eq(['spidr.rubyforge.org'])
+    end
+
+    it "should provide the hosts that will not be visited" do
+      agent = Agent.new(ignore_hosts: ['example.com'])
+
+      expect(agent.ignore_hosts).to eq(['example.com'])
+    end
+
+    it "should provide the ports that will be visited" do
+      agent = Agent.new(ports: [80, 443, 8000])
+
+      expect(agent.visit_ports).to eq([80, 443, 8000])
+    end
+
+    it "should provide the ports that will not be visited" do
+      agent = Agent.new(ignore_ports: [8000, 8080])
+
+      expect(agent.ignore_ports).to eq([8000, 8080])
+    end
+
+    it "should provide the links that will be visited" do
+      agent = Agent.new(links: ['index.php'])
+
+      expect(agent.visit_links).to eq(['index.php'])
+    end
+
+    it "should provide the links that will not be visited" do
+      agent = Agent.new(ignore_links: [/login/])
+
+      expect(agent.ignore_links).to eq([/login/])
+    end
+
+    it "should provide the exts that will be visited" do
+      agent = Agent.new(exts: ['htm'])
+
+      expect(agent.visit_exts).to eq(['htm'])
+    end
+
+    it "should provide the exts that will not be visited" do
+      agent = Agent.new(ignore_exts: ['cfm'])
+
+      expect(agent.ignore_exts).to eq(['cfm'])
+    end
+  end
+end
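The filter options exercised above can be combined freely when constructing an agent. A minimal sketch (the host, pattern, and extension values are illustrative only):

    require 'spidr/agent'

    agent = Spidr::Agent.new(
      hosts:        ['spidr.rubyforge.org'], # only crawl this host
      ignore_links: [/login/],               # skip links matching /login/
      ignore_exts:  ['cfm'],                 # skip ColdFusion files
      ports:        [80, 443]                # standard HTTP(S) ports only
    )
    agent.start_at('http://spidr.rubyforge.org/')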
data/spec/agent/sanitizers_spec.rb (new file)
@@ -0,0 +1,62 @@
+require 'spidr/agent'
+
+require 'spec_helper'
+
+describe Agent do
+  describe "sanitizers" do
+    describe "sanitize_url" do
+      let(:url) { 'http://host.com' }
+      before(:all) { @agent = Agent.new }
+
+      it "should sanitize URLs" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(URI(url))
+
+        expect(clean_url.host).to eq('host.com')
+      end
+
+      it "should sanitize URLs given as Strings" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.host).to eq('host.com')
+      end
+    end
+
+    describe "strip_fragments" do
+      let(:url) { URI("http://host.com/page#lol") }
+
+      it "should strip fragment components by default" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.fragment).to be_nil
+      end
+
+      it "should allow perserving fragment components" do
+        agent = Agent.new(strip_fragments: false)
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.fragment).to eq('lol')
+      end
+    end
+
+    describe "strip_query" do
+      let(:url) { URI("http://host.com/page?x=1") }
+
+      it "should not strip query components by default" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.query).to eq('x=1')
+      end
+
+      it "should allow stripping of query components" do
+        agent = Agent.new(strip_query: true)
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.query).to be_nil
+      end
+    end
+  end
+end
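Sanitization is applied to every URL before it is visited, but sanitize_url can also be called directly, as these specs do. A short sketch of both options together (the URL is a placeholder; the comment shows the outcome implied by the specs above):

    require 'spidr/agent'

    # strip_fragments already defaults to true; strip_query does not.
    agent = Spidr::Agent.new(strip_query: true)

    agent.sanitize_url(URI('http://host.com/page?x=1#top'))
    # => URI for http://host.com/page (fragment and query both removed)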
data/spec/agent_spec.rb
@@ -11,11 +11,11 @@ describe Agent do
   end
 
   it "should provide the history" do
-    @agent.history.should_not be_empty
+    expect(@agent.history).not_to be_empty
   end
 
   it "should provide the queue" do
-    @agent.queue.should be_empty
+    expect(@agent.queue).to be_empty
   end
 
   it "should be able to restore the history" do
@@ -23,7 +23,7 @@ describe Agent do
     previous_history = Set[URI('http://www.example.com')]
 
     agent.history = previous_history
-    agent.history.should == previous_history
+    expect(agent.history).to eq(previous_history)
   end
 
   it "should convert new histories to an Set of URIs" do
@@ -32,8 +32,8 @@ describe Agent do
     expected_history = Set[URI('http://www.example.com')]
 
     agent.history = previous_history
-    agent.history.should_not == previous_history
-    agent.history.should == expected_history
+    expect(agent.history).not_to eq(previous_history)
+    expect(agent.history).to eq(expected_history)
   end
 
   it "should be able to restore the failures" do
@@ -41,7 +41,7 @@ describe Agent do
     previous_failures = Set[URI('http://localhost/')]
 
     agent.failures = previous_failures
-    agent.failures.should == previous_failures
+    expect(agent.failures).to eq(previous_failures)
   end
 
   it "should convert new histories to a Set of URIs" do
@@ -50,8 +50,8 @@ describe Agent do
     expected_failures = Set[URI('http://localhost/')]
 
     agent.failures = previous_failures
-    agent.failures.should_not == previous_failures
-    agent.failures.should == expected_failures
+    expect(agent.failures).not_to eq(previous_failures)
+    expect(agent.failures).to eq(expected_failures)
   end
 
   it "should be able to restore the queue" do
@@ -59,7 +59,7 @@ describe Agent do
     previous_queue = [URI('http://www.example.com')]
 
     agent.queue = previous_queue
-    agent.queue.should == previous_queue
+    expect(agent.queue).to eq(previous_queue)
   end
 
   it "should convert new queues to an Array of URIs" do
@@ -68,14 +68,14 @@ describe Agent do
     expected_queue = [URI('http://www.example.com')]
 
     agent.queue = previous_queue
-    agent.queue.should_not == previous_queue
-    agent.queue.should == expected_queue
+    expect(agent.queue).not_to eq(previous_queue)
+    expect(agent.queue).to eq(expected_queue)
   end
 
   it "should provide a to_hash method that returns the queue and history" do
     hash = @agent.to_hash
 
-    hash[:queue].should be_empty
-    hash[:history].should_not be_empty
+    expect(hash[:queue]).to be_empty
+    expect(hash[:history]).not_to be_empty
   end
 end
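The remaining spec changes in this release are a mechanical migration from RSpec 2's deprecated should syntax to the RSpec 3 expect syntax; tools such as transpec automate exactly this rewrite. A self-contained illustration of the pattern, independent of spidr:

    require 'rspec'

    RSpec.describe Array do
      it "uses expect in place of should" do
        list = []

        # Formerly: list.should be_empty
        expect(list).to be_empty

        # Formerly: lambda { list.push(1) }.should change(list, :size)
        expect { list.push(1) }.to change(list, :size)
      end
    end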
data/spec/auth_store_spec.rb
@@ -17,25 +17,25 @@ describe AuthStore do
 
   it 'should retrieve auth credentials for the URL' do
     @auth_store[root_uri] = AuthCredential.new('user1', 'pass1')
-    @auth_store[root_uri].username.should == 'user1'
-    @auth_store[root_uri].password.should == 'pass1'
+    expect(@auth_store[root_uri].username).to eq('user1')
+    expect(@auth_store[root_uri].password).to eq('pass1')
   end
 
   it 'should add auth credentials for the URL' do
-    lambda {
+    expect {
      @auth_store.add(root_uri, 'user1', 'pass1')
-    }.should change(@auth_store, :size)
+    }.to change(@auth_store, :size)
 
-    @auth_store[root_uri].username.should == 'user1'
-    @auth_store[root_uri].password.should == 'pass1'
+    expect(@auth_store[root_uri].username).to eq('user1')
+    expect(@auth_store[root_uri].password).to eq('pass1')
   end
 
   describe 'matching' do
     let(:sub_uri) { uri.merge('/course/auth/protected.html') }
 
     it 'should match a longer URL to the base' do
-      @auth_store[sub_uri].username.should == 'admin'
-      @auth_store[sub_uri].password.should == 'password'
+      expect(@auth_store[sub_uri].username).to eq('admin')
+      expect(@auth_store[sub_uri].password).to eq('password')
     end
 
     it 'should match the longest of all matching URLs' do
@@ -44,42 +44,42 @@ describe AuthStore do
       @auth_store.add(uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
 
       auth = @auth_store[uri.merge('/course/auth/special/1.html')]
-      auth.username.should == 'user2'
-      auth.password.should == 'pass2'
+      expect(auth.username).to eq('user2')
+      expect(auth.password).to eq('pass2')
     end
 
     it 'should not match a URL with a different host' do
       remote_uri = URI('http://spidr.rubyforge.org/course/auth')
 
-      @auth_store[remote_uri].should be_nil
+      expect(@auth_store[remote_uri]).to be_nil
     end
 
     it 'should not match a URL with an alternate path' do
       relative_uri = uri.merge('/course/admin/protected.html')
 
-      @auth_store[relative_uri].should be_nil
+      expect(@auth_store[relative_uri]).to be_nil
     end
   end
 
   it 'should override previous auth credentials' do
     @auth_store.add(uri, 'newuser', 'newpass')
 
-    @auth_store[uri].username.should == 'newuser'
-    @auth_store[uri].password.should == 'newpass'
+    expect(@auth_store[uri].username).to eq('newuser')
+    expect(@auth_store[uri].password).to eq('newpass')
   end
 
   it 'should clear all cookies' do
     @auth_store.clear!
-    @auth_store.size.should == 0
+    expect(@auth_store.size).to eq(0)
   end
 
   describe 'for_url' do
     it 'should return nil if no authorization exists' do
-      @auth_store.for_url(URI('http://php.net')).should be_nil
+      expect(@auth_store.for_url(URI('http://php.net'))).to be_nil
    end
 
    it 'should create an encoded authorization string' do
-      @auth_store.for_url(uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
+      expect(@auth_store.for_url(uri)).to eq("YWRtaW46cGFzc3dvcmQ=\n")
    end
  end
 end
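For reference, the AuthStore behavior these specs pin down: credentials are stored per URL prefix, looked up by the longest matching path on the same host, and encoded on demand. A hedged sketch (the example.com URLs are placeholders):

    require 'spidr/auth_store'

    store = Spidr::AuthStore.new
    store.add(URI('http://example.com/course/auth'), 'admin', 'password')

    # Longest-prefix matching: deeper paths inherit the credentials;
    # other paths and other hosts get nil.
    store[URI('http://example.com/course/auth/protected.html')] # => AuthCredential
    store[URI('http://example.com/elsewhere')]                  # => nil

    # Base64-encoded "admin:password", ready for a Basic Authorization header.
    store.for_url(URI('http://example.com/course/auth'))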
data/spec/cookie_jar_spec.rb
@@ -6,39 +6,39 @@ describe CookieJar do
   it "should retrieve cookies for the named host" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
 
-    subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
+    expect(subject['zerosum.org']).to eq({'admin' => 'ofcourseiam'})
   end
 
   it "should add a cookie to the jar" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
 
-    subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
+    expect(subject['zerosum.org']).to eq({'admin' => 'ofcourseiam'})
   end
 
   it "should merge new cookies into the jar" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
     subject['zerosum.org'] = {'other' => '1'}
 
-    subject['zerosum.org'].should == {
+    expect(subject['zerosum.org']).to eq({
       'admin' => 'ofcourseiam',
       'other' => '1'
-    }
+    })
   end
 
   it "should override previous cookies in the jar" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
     subject['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
 
-    subject['zerosum.org'].should == {
+    expect(subject['zerosum.org']).to eq({
       'admin' => 'somethingcompletelydifferent'
-    }
+    })
   end
 
   it "should clear all cookies" do
     subject['zerosum.org'] = {'cookie' => 'foobar'}
     subject.clear!
 
-    subject.size.should == 0
+    expect(subject.size).to eq(0)
   end
 
   describe "dirty" do
@@ -48,37 +48,37 @@ describe CookieJar do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
       subject['zerosum.org'] = {'other' => '1'}
 
-      dirty.include?('zerosum.org').should == true
+      expect(dirty.include?('zerosum.org')).to eq(true)
     end
 
     it "should mark a cookie dirty after overriding params" do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
      subject['zerosum.org'] = {'admin' => 'nope'}
 
-      dirty.include?('zerosum.org').should == true
+      expect(dirty.include?('zerosum.org')).to eq(true)
     end
 
     it "should un-mark a cookie as dirty after re-encoding it" do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
       subject['zerosum.org'] = {'admin' => 'nope'}
 
-      dirty.include?('zerosum.org').should == true
+      expect(dirty.include?('zerosum.org')).to eq(true)
 
       subject.for_host('zerosum.org')
 
-      dirty.include?('zerosum.org').should == false
+      expect(dirty.include?('zerosum.org')).to eq(false)
     end
   end
 
   describe "cookies_for_host" do
     it "should return an empty Hash for unknown hosts" do
-      subject.cookies_for_host('lol.com').should be_empty
+      expect(subject.cookies_for_host('lol.com')).to be_empty
     end
 
     it "should return an empty Hash for hosts with no cookie params" do
       subject['lol.com'] = {}
 
-      subject.cookies_for_host('lol.com').should be_empty
+      expect(subject.cookies_for_host('lol.com')).to be_empty
     end
 
     it "should return cookie parameters for the host" do
@@ -87,8 +87,8 @@ describe CookieJar do
 
       cookie = subject.cookies_for_host('zerosum.org')
 
-      cookie['admin'].should == 'ofcourseiam'
-      cookie['other'].should == '1'
+      expect(cookie['admin']).to eq('ofcourseiam')
+      expect(cookie['other']).to eq('1')
     end
 
     it "should include cookies for the parent domain" do
@@ -97,26 +97,26 @@ describe CookieJar do
 
       cookie = subject.cookies_for_host('sub.zerosum.org')
 
-      cookie['admin'].should == 'ofcourseiam'
-      cookie['other'].should == '1'
+      expect(cookie['admin']).to eq('ofcourseiam')
+      expect(cookie['other']).to eq('1')
     end
   end
 
   describe "for_host" do
     it "should return nil for unknown hosts" do
-      subject.for_host('lol.com').should be_nil
+      expect(subject.for_host('lol.com')).to be_nil
     end
 
     it "should return nil for hosts with no cookie params" do
       subject['lol.com'] = {}
 
-      subject.for_host('lol.com').should be_nil
+      expect(subject.for_host('lol.com')).to be_nil
     end
 
     it "should encode single cookie params" do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
 
-      subject.for_host('zerosum.org').should == 'admin=ofcourseiam'
+      expect(subject.for_host('zerosum.org')).to eq('admin=ofcourseiam')
     end
 
     it "should encode multiple cookie params" do
@@ -125,9 +125,9 @@ describe CookieJar do
 
       cookie = subject.for_host('zerosum.org')
 
-      cookie.should include('admin=ofcourseiam')
-      cookie.should include('; ')
-      cookie.should include('other=1')
+      expect(cookie).to include('admin=ofcourseiam')
+      expect(cookie).to include('; ')
+      expect(cookie).to include('other=1')
     end
 
     it "should include cookies for the parent domain" do
@@ -136,9 +136,9 @@ describe CookieJar do
 
      cookie = subject.for_host('sub.zerosum.org')
 
-      cookie.should include('admin=ofcourseiam')
-      cookie.should include('; ')
-      cookie.should include('other=1')
+      expect(cookie).to include('admin=ofcourseiam')
+      expect(cookie).to include('; ')
+      expect(cookie).to include('other=1')
     end
   end
 end
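Likewise, the CookieJar behavior under test: per-host assignments merge parameter-wise, subdomains inherit the parent domain's cookies, and for_host re-encodes the parameters as a header value. A sketch (example.com is a placeholder; pair ordering in the encoded string is not guaranteed, which is why the specs only assert include):

    require 'spidr/cookie_jar'

    jar = Spidr::CookieJar.new
    jar['example.com'] = {'session' => 'abc123'}
    jar['example.com'] = {'theme' => 'dark'}   # merged into, not replacing, the above

    jar.for_host('example.com')
    # => "key=value" pairs joined by "; ", e.g. "session=abc123; theme=dark"

    jar.cookies_for_host('sub.example.com')
    # => {'session' => 'abc123', 'theme' => 'dark'} (parent-domain cookies included)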