spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,79 @@
module Spidr
  #
  # The {Rules} class represents collections of acceptance and rejection
  # rules, which are used to filter data.
  #
  # When at least one acceptance rule is present, only the acceptance rules
  # are consulted. The rejection rules are consulted only when there are no
  # acceptance rules.
  #
  class Rules

    # Accept rules
    attr_reader :accept

    # Reject rules
    attr_reader :reject

    #
    # Creates a new Rules object.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array<String, Regexp, Proc>, String, Regexp, Proc] :accept
    #   The pattern(s) to accept data with. A single pattern is treated as
    #   a one-element list.
    #
    # @option options [Array<String, Regexp, Proc>, String, Regexp, Proc] :reject
    #   The pattern(s) to reject data with. A single pattern is treated as
    #   a one-element list.
    #
    def initialize(options={})
      @accept = coerce_rules(options[:accept])
      @reject = coerce_rules(options[:reject])
    end

    #
    # Determines whether the data should be accepted or rejected.
    #
    # @param [Object] data
    #   The data to test against the rules.
    #
    # @return [Boolean]
    #   Specifies whether the given data was accepted, using the rules
    #   acceptance patterns.
    #
    def accept?(data)
      if @accept.empty?
        # no acceptance rules: accept anything that is not rejected
        @reject.none? { |rule| test_data(data,rule) }
      else
        @accept.any? { |rule| test_data(data,rule) }
      end
    end

    #
    # Determines whether the data should be rejected or accepted.
    #
    # @param [Object] data
    #   The data to test against the rules.
    #
    # @return [Boolean]
    #   Specifies whether the given data was rejected; always the negation
    #   of {#accept?}.
    #
    def reject?(data)
      !accept?(data)
    end

    protected

    #
    # Tests the given data against a given pattern.
    #
    # A Proc matches only when it returns exactly `true`, a Regexp is
    # matched against `data.to_s`, and any other rule is compared to the
    # data with `==`.
    #
    # @return [Boolean]
    #   Specifies whether the given data matched the pattern.
    #
    def test_data(data,rule)
      case rule
      when Proc   then rule.call(data) == true
      when Regexp then !((data.to_s =~ rule).nil?)
      else             data == rule
      end
    end

    private

    #
    # Converts an option value into a fresh Array of rules.
    #
    # `nil` and `false` yield an empty list. The result is always a new
    # Array, so later changes to the rules never touch the caller's object.
    #
    def coerce_rules(value)
      value ? Array(value).dup : []
    end

  end
end
@@ -0,0 +1,56 @@
1
+ require 'uri'
2
+
module Spidr
  #
  # The {Sanitizers} module adds methods to {Agent} which control the
  # sanitation of incoming links.
  #
  module Sanitizers
    # Specifies whether the Agent will strip URI fragments
    attr_accessor :strip_fragments

    # Specifies whether the Agent will strip URI queries
    attr_accessor :strip_query

    #
    # Sanitizes a URL based on filtering options.
    #
    # The given URL is never modified; a sanitized copy is returned.
    #
    # @param [URI::HTTP, URI::HTTPS, String] url
    #   The URL to be sanitized
    #
    # @return [URI::HTTP, URI::HTTPS]
    #   The new sanitized URL.
    #
    # @since 0.2.2
    #
    def sanitize_url(url)
      # duplicate URI arguments so stripping components below does not
      # alter the object the caller still holds
      sanitized = if url.kind_of?(URI) then url.dup
                  else                      URI(url.to_s)
                  end

      sanitized.fragment = nil if @strip_fragments
      sanitized.query = nil if @strip_query

      return sanitized
    end

    protected

    #
    # Initializes the Sanitizer rules.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Boolean] :strip_fragments (true)
    #   Specifies whether or not to strip the fragment component from URLs.
    #
    # @option options [Boolean] :strip_query (false)
    #   Specifies whether or not to strip the query component from URLs.
    #
    # @since 0.2.2
    #
    def initialize_sanitizers(options={})
      @strip_fragments = options.fetch(:strip_fragments,true)
      @strip_query = options.fetch(:strip_query,false)
    end
  end
end
@@ -0,0 +1,145 @@
1
+ require 'spidrs/spidrs'
2
+
3
+ require 'net/http'
4
+
module Spidr
  #
  # Stores active HTTP Sessions organized by scheme, host-name and port.
  #
  class SessionCache

    # Proxy to use
    attr_accessor :proxy

    #
    # Creates a new session cache.
    #
    # @param [Hash] proxy (Spidr.proxy)
    #   Proxy options.
    #
    # @option proxy [String] :host
    #   The host the proxy is running on.
    #
    # @option proxy [Integer] :port
    #   The port the proxy is running on.
    #
    # @option proxy [String] :user
    #   The user to authenticate as with the proxy.
    #
    # @option proxy [String] :password
    #   The password to authenticate with.
    #
    # @since 0.2.2
    #
    def initialize(proxy=Spidr.proxy)
      @proxy = proxy
      @sessions = {}
    end

    #
    # Determines if there is an active HTTP session for a given URL.
    #
    # @param [URI::HTTP, String] url
    #   The URL that represents a session.
    #
    # @return [Boolean]
    #   Specifies whether there is an active HTTP session.
    #
    # @since 0.2.3
    #
    def active?(url)
      @sessions.key?(session_key(to_uri(url)))
    end

    #
    # Provides an active HTTP session for a given URL.
    #
    # A session is created and cached on first use. Only `https` sessions
    # are started here; other sessions are returned unstarted.
    #
    # @param [URI::HTTP, String] url
    #   The URL which will be requested later.
    #
    # @return [Net::HTTP]
    #   The active HTTP session object.
    #
    def [](url)
      url = to_uri(url)
      key = session_key(url)

      unless @sessions[key]
        session = Net::HTTP::Proxy(
          @proxy[:host],
          @proxy[:port],
          @proxy[:user],
          @proxy[:password]
        ).new(url.host,url.port)

        if url.scheme == 'https'
          session.use_ssl = true
          # NOTE(review): certificate verification is disabled here, so
          # the peer's identity is not checked. Left unchanged to preserve
          # behavior; confirm whether this is still intended.
          session.verify_mode = OpenSSL::SSL::VERIFY_NONE
          session.start
        end

        @sessions[key] = session
      end

      return @sessions[key]
    end

    #
    # Destroys an HTTP session for the given scheme, host and port.
    #
    # Does nothing when no session is cached for the URL.
    #
    # @param [URI::HTTP, String] url
    #   The URL of the requested session.
    #
    # @return [nil]
    #
    # @since 0.2.2
    #
    def kill!(url)
      key = session_key(to_uri(url))

      if (sess = @sessions[key])
        finish_session(sess)
        @sessions.delete(key)
      end

      # honor the documented return value rather than leaking the
      # finished session object
      return nil
    end

    #
    # Clears the session cache.
    #
    # @return [SessionCache]
    #   The cleared session cache.
    #
    # @since 0.2.2
    #
    def clear
      @sessions.each_value { |sess| finish_session(sess) }

      @sessions.clear
      return self
    end

    private

    # Converts the given URL into a URI, leaving URI objects untouched.
    def to_uri(url)
      if url.kind_of?(URI) then url
      else                      URI(url.to_s)
      end
    end

    # Builds the cache key (scheme, host, port) for a URI.
    def session_key(uri)
      [uri.scheme, uri.host, uri.port]
    end

    # Finishes a session, ignoring the IOError raised by a session that
    # cannot be finished (for example, one that was never started).
    def finish_session(session)
      session.finish
    rescue IOError
      nil
    end

  end
end
@@ -0,0 +1,107 @@
1
+ require 'spidrs/agent'
2
+
module Spidr
  # Common proxy port.
  COMMON_PROXY_PORT = 8080

  # Default proxy information.
  DEFAULT_PROXY = {
    :host => nil,
    :port => COMMON_PROXY_PORT,
    :user => nil,
    :password => nil
  }

  #
  # Proxy information used by all newly created Agent objects by default.
  #
  # @return [Hash]
  #   The Spidr proxy information.
  #
  def Spidr.proxy
    # hand out a copy so that editing the returned Hash cannot alter the
    # DEFAULT_PROXY constant
    @@spidr_proxy ||= DEFAULT_PROXY.dup
  end

  #
  # Sets the proxy information used by Agent objects.
  #
  # @param [Hash] new_proxy
  #   The new proxy information.
  #
  # @option new_proxy [String] :host
  #   The host-name of the proxy.
  #
  # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
  #   The port of the proxy.
  #
  # @option new_proxy [String] :user
  #   The user to authenticate with the proxy as.
  #
  # @option new_proxy [String] :password
  #   The password to authenticate with the proxy.
  #
  # @return [Hash]
  #   The new proxy information.
  #
  def Spidr.proxy=(new_proxy)
    @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
  end

  #
  # Disables the proxy settings used by all newly created Agent objects.
  #
  # @return [true]
  #
  def Spidr.disable_proxy!
    # reset to a fresh copy of the defaults, not the constant itself
    @@spidr_proxy = DEFAULT_PROXY.dup
    return true
  end

  #
  # The User-Agent string used by all Agent objects by default.
  #
  # @return [String, nil]
  #   The Spidr User-Agent string, or `nil` when none has been set.
  #
  def Spidr.user_agent
    @@spidr_user_agent ||= nil
  end

  #
  # Sets the Spidr User-Agent string.
  #
  # @param [String] new_agent
  #   The new User-Agent string.
  #
  def Spidr.user_agent=(new_agent)
    @@spidr_user_agent = new_agent
  end

  #
  # Forwards to Agent.start_at, with or without a URL-matching regex.
  #
  # This file previously defined `Spidr.start_at` twice; the second
  # definition replaced the first and made the regex mandatory. Both call
  # shapes are supported here:
  #
  #     Spidr.start_at(url, options={}, &block)
  #     Spidr.start_at(url, regex, options={}, &block)
  #
  # NOTE(review): Agent.start_at is defined outside this file; confirm it
  # accepts both argument shapes.
  #
  # @param [URI::HTTP, String] url
  #   The URL to start at.
  #
  # @param [Array] args
  #   An optional regex used to match specific URLs, followed by an
  #   optional options Hash.
  #
  # @raise [ArgumentError]
  #   More than three arguments were given.
  #
  # @see Agent.start_at
  #
  def Spidr.start_at(url,*args,&block)
    if args.length > 2
      raise(ArgumentError,"wrong number of arguments (#{args.length + 1} for 1..3)")
    end

    # both original signatures always passed an options Hash to Agent,
    # so supply the default when the caller left it out
    args << {} if args.length < 2 && !args.last.kind_of?(Hash)

    Agent.start_at(url,*args,&block)
  end

  #
  # @see Agent.host
  #
  def Spidr.host(name,options={},&block)
    Agent.host(name,options,&block)
  end

  #
  # @see Agent.site
  #
  def Spidr.site(url,options={},&block)
    Agent.site(url,options,&block)
  end
end
@@ -0,0 +1,4 @@
module Spidr
  # Spidr version (frozen so the constant cannot be modified in place)
  VERSION = '1.0.0'.freeze
end
@@ -0,0 +1,4 @@
module Spidr
  # Spidr version (frozen so the constant cannot be modified in place)
  VERSION = '1.0.0'.freeze
end
data/lib/spidr.rb ADDED
@@ -0,0 +1,3 @@
1
+ require 'spidr/agent'
2
+ require 'spidr/spidr'
3
+ require 'spidr/version'
Binary file
@@ -0,0 +1,59 @@
1
+ require 'spidr/actions'
2
+ require 'spidr/agent'
3
+
4
+ require 'spec_helper'
5
+
# Specs for the pause / continue / skip actions exposed on Agent.
describe Actions do
  let(:url) { URI('http://spidr.rubyforge.org/') }

  it "should be able to pause spidering" do
    count = 0
    agent = Agent.host('spidr.rubyforge.org') do |spider|
      spider.every_page do |_page|
        count += 1
        spider.pause! if count >= 2
      end
    end

    agent.should be_paused
    agent.history.length.should == 2
  end

  it "should be able to continue spidering after being paused" do
    agent = Agent.new do |spider|
      spider.every_page do |_page|
        spider.pause!
      end
    end

    agent.enqueue(url)
    agent.continue!

    agent.visited?(url).should == true
  end

  it "should allow skipping of enqueued links" do
    agent = Agent.new do |spider|
      # the block argument is unused; it is not called +url+ so that it
      # does not hide the +url+ helper defined with +let+
      spider.every_url do |_link|
        spider.skip_link!
      end
    end

    agent.enqueue(url)

    agent.queue.should be_empty
  end

  it "should allow skipping of visited pages" do
    agent = Agent.new do |spider|
      spider.every_page do |_page|
        spider.skip_page!
      end
    end

    agent.visit_page(url)

    agent.history.should == Set[url]
    agent.queue.should be_empty
  end
end
@@ -0,0 +1,81 @@
1
+ require 'spidr/agent'
2
+
3
+ require 'spec_helper'
4
+ require 'helpers/wsoc'
5
+
# Specs for the history, failures and queue state exposed by Agent.
describe Agent do
  include Helpers::WSOC

  # the course is run once and shared by the read-only examples below
  before(:all) do
    @agent = run_course
  end

  it "should provide the history" do
    @agent.history.should_not be_empty
  end

  it "should provide the queue" do
    @agent.queue.should be_empty
  end

  it "should be able to restore the history" do
    agent = Agent.new
    previous_history = Set[URI('http://www.example.com')]

    agent.history = previous_history
    agent.history.should == previous_history
  end

  it "should convert new histories to a Set of URIs" do
    agent = Agent.new
    previous_history = ['http://www.example.com']
    expected_history = Set[URI('http://www.example.com')]

    agent.history = previous_history
    agent.history.should_not == previous_history
    agent.history.should == expected_history
  end

  it "should be able to restore the failures" do
    agent = Agent.new
    previous_failures = Set[URI('http://localhost/')]

    agent.failures = previous_failures
    agent.failures.should == previous_failures
  end

  it "should convert new failures to a Set of URIs" do
    agent = Agent.new
    previous_failures = ['http://localhost/']
    expected_failures = Set[URI('http://localhost/')]

    agent.failures = previous_failures
    agent.failures.should_not == previous_failures
    agent.failures.should == expected_failures
  end

  it "should be able to restore the queue" do
    agent = Agent.new
    previous_queue = [URI('http://www.example.com')]

    agent.queue = previous_queue
    agent.queue.should == previous_queue
  end

  it "should convert new queues to an Array of URIs" do
    agent = Agent.new
    previous_queue = ['http://www.example.com']
    expected_queue = [URI('http://www.example.com')]

    agent.queue = previous_queue
    agent.queue.should_not == previous_queue
    agent.queue.should == expected_queue
  end

  it "should provide a to_hash method that returns the queue and history" do
    hash = @agent.to_hash

    hash[:queue].should be_empty
    hash[:history].should_not be_empty
  end
end
@@ -0,0 +1,85 @@
1
+ require 'spidr/auth_store'
2
+
3
+ require 'spec_helper'
4
+
# Specs for storing, matching and encoding credentials in AuthStore.
describe AuthStore do
  let(:root_uri) { URI('http://zerosum.org/') }
  let(:uri) { root_uri.merge('/course/auth') }

  # every example starts with one credential registered for +uri+
  before(:each) do
    @auth_store = AuthStore.new
    @auth_store.add(uri, 'admin', 'password')
  end

  after(:each) do
    @auth_store.clear!
  end

  it 'should retrieve auth credentials for the URL' do
    @auth_store[root_uri] = AuthCredential.new('user1', 'pass1')
    @auth_store[root_uri].username.should == 'user1'
    @auth_store[root_uri].password.should == 'pass1'
  end

  it 'should add auth credentials for the URL' do
    lambda {
      @auth_store.add(root_uri, 'user1', 'pass1')
    }.should change(@auth_store, :size)

    @auth_store[root_uri].username.should == 'user1'
    @auth_store[root_uri].password.should == 'pass1'
  end

  describe 'matching' do
    let(:sub_uri) { uri.merge('/course/auth/protected.html') }

    it 'should match a longer URL to the base' do
      @auth_store[sub_uri].username.should == 'admin'
      @auth_store[sub_uri].password.should == 'password'
    end

    it 'should match the longest of all matching URLs' do
      @auth_store.add(uri.merge('/course'), 'user1', 'pass1')
      @auth_store.add(uri.merge('/course/auth/special'), 'user2', 'pass2')
      @auth_store.add(uri.merge('/course/auth/special/extra'), 'user3', 'pass3')

      auth = @auth_store[uri.merge('/course/auth/special/1.html')]
      auth.username.should == 'user2'
      auth.password.should == 'pass2'
    end

    it 'should not match a URL with a different host' do
      remote_uri = URI('http://spidr.rubyforge.org/course/auth')

      @auth_store[remote_uri].should be_nil
    end

    it 'should not match a URL with an alternate path' do
      relative_uri = uri.merge('/course/admin/protected.html')

      @auth_store[relative_uri].should be_nil
    end
  end

  it 'should override previous auth credentials' do
    @auth_store.add(uri, 'newuser', 'newpass')

    @auth_store[uri].username.should == 'newuser'
    @auth_store[uri].password.should == 'newpass'
  end

  it 'should clear all auth credentials' do
    @auth_store.clear!
    @auth_store.size.should == 0
  end

  describe 'for_url' do
    it 'should return nil if no authorization exists' do
      @auth_store.for_url(URI('http://php.net')).should be_nil
    end

    it 'should create an encoded authorization string' do
      # the expected value is the Base64 form of "admin:password"
      @auth_store.for_url(uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
    end
  end
end