spidr_epg 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,79 @@
1
module Spidr
  #
  # The {Rules} class represents collections of acceptance and rejection
  # rules, which are used to filter data.
  #
  # When at least one acceptance rule is present, only the acceptance
  # rules are consulted. The rejection rules are consulted only when
  # there are no acceptance rules.
  #
  class Rules

    # Accept rules
    attr_reader :accept

    # Reject rules
    attr_reader :reject

    #
    # Creates a new Rules object.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array<String, Regexp, Proc>, String, Regexp, Proc] :accept
    #   The pattern, or patterns, to accept data with.
    #
    # @option options [Array<String, Regexp, Proc>, String, Regexp, Proc] :reject
    #   The pattern, or patterns, to reject data with.
    #
    def initialize(options={})
      @accept = []
      @reject = []

      # Array() wraps a lone pattern so that a single String, Regexp or
      # Proc is accepted as well as an Array of them. The `+=` still
      # copies, so the caller's Array is never shared.
      @accept += Array(options[:accept]) if options[:accept]
      @reject += Array(options[:reject]) if options[:reject]
    end

    #
    # Determines whether the data should be accepted or rejected.
    #
    # @param [Object] data
    #   The data to test against the rules.
    #
    # @return [Boolean]
    #   Specifies whether the given data was accepted, using the rules
    #   acceptance patterns. Without acceptance patterns, the data is
    #   accepted unless it matches a rejection pattern.
    #
    def accept?(data)
      if @accept.empty?
        @reject.none? { |rule| test_data(data,rule) }
      else
        @accept.any? { |rule| test_data(data,rule) }
      end
    end

    #
    # Determines whether the data should be rejected or accepted.
    #
    # @param [Object] data
    #   The data to test against the rules.
    #
    # @return [Boolean]
    #   The negation of {#accept?} for the same data.
    #
    def reject?(data)
      !accept?(data)
    end

    protected

    #
    # Tests the given data against a given pattern.
    #
    # @param [Object] data
    #   The data to test.
    #
    # @param [String, Regexp, Proc, Object] rule
    #   A Proc is called with the data and must return exactly `true`,
    #   a Regexp is matched against `data.to_s`, and anything else is
    #   compared with `==`.
    #
    # @return [Boolean]
    #   Specifies whether the given data matched the pattern.
    #
    def test_data(data,rule)
      if rule.kind_of?(Proc)
        # only a literal `true` counts; other truthy results do not match
        rule.call(data) == true
      elsif rule.kind_of?(Regexp)
        !(data.to_s =~ rule).nil?
      else
        data == rule
      end
    end

  end
end
@@ -0,0 +1,56 @@
1
+ require 'uri'
2
+
3
module Spidr
  #
  # The {Sanitizers} module adds methods to {Agent} which control the
  # sanitation of incoming links.
  #
  module Sanitizers
    # Specifies whether the Agent will strip URI fragments
    attr_accessor :strip_fragments

    # Specifies whether the Agent will strip URI queries
    attr_accessor :strip_query

    #
    # Sanitizes a URL based on filtering options.
    #
    # The argument itself is never modified: a URI argument is copied
    # before its fragment and/or query are stripped.
    #
    # @param [URI::HTTP, URI::HTTPS, String] url
    #   The URL to be sanitized
    #
    # @return [URI::HTTP, URI::HTTPS]
    #   The new sanitized URL.
    #
    # @since 0.2.2
    #
    def sanitize_url(url)
      # Copy URI arguments so the setters below cannot alter an object
      # the caller still holds (e.g. one stored in a queue or history).
      url = if url.kind_of?(URI)
              url.dup
            else
              URI(url.to_s)
            end

      url.fragment = nil if @strip_fragments
      url.query    = nil if @strip_query

      return url
    end

    protected

    #
    # Initializes the Sanitizer rules.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Boolean] :strip_fragments (true)
    #   Specifies whether or not to strip the fragment component from URLs.
    #
    # @option options [Boolean] :strip_query (false)
    #   Specifies whether or not to strip the query component from URLs.
    #
    # @since 0.2.2
    #
    def initialize_sanitizers(options={})
      @strip_fragments = options.fetch(:strip_fragments,true)
      @strip_query     = options.fetch(:strip_query,false)
    end
  end
end
@@ -0,0 +1,145 @@
1
+ require 'spidrs/spidrs'
2
+
3
+ require 'net/http'
4
+
5
module Spidr
  #
  # Stores active HTTP Sessions organized by scheme, host-name and port.
  #
  class SessionCache

    # Proxy to use
    attr_accessor :proxy

    #
    # Creates a new session cache.
    #
    # @param [Hash] proxy (Spidr.proxy)
    #   Proxy options.
    #
    # @option proxy [String] :host
    #   The host the proxy is running on.
    #
    # @option proxy [Integer] :port
    #   The port the proxy is running on.
    #
    # @option proxy [String] :user
    #   The user to authenticate as with the proxy.
    #
    # @option proxy [String] :password
    #   The password to authenticate with.
    #
    # @since 0.2.2
    #
    def initialize(proxy=Spidr.proxy)
      @proxy    = proxy
      @sessions = {}
    end

    #
    # Determines if there is an active HTTP session for a given URL.
    #
    # @param [URI::HTTP, String] url
    #   The URL that represents a session.
    #
    # @return [Boolean]
    #   Specifies whether there is an active HTTP session.
    #
    # @since 0.2.3
    #
    def active?(url)
      @sessions.has_key?(key_for(normalize_url(url)))
    end

    #
    # Provides an active HTTP session for a given URL.
    #
    # A session is created on first use and cached under its scheme,
    # host and port. HTTPS sessions are configured for SSL and started
    # immediately; plain HTTP sessions are returned without being started.
    #
    # @param [URI::HTTP, String] url
    #   The URL which will be requested later.
    #
    # @return [Net::HTTP]
    #   The active HTTP session object.
    #
    def [](url)
      url = normalize_url(url)
      key = key_for(url)

      unless @sessions[key]
        session = Net::HTTP::Proxy(
          @proxy[:host],
          @proxy[:port],
          @proxy[:user],
          @proxy[:password]
        ).new(url.host,url.port)

        if url.scheme == 'https'
          session.use_ssl = true
          # NOTE(review): certificate verification is disabled here, so
          # HTTPS peers are not authenticated. Left unchanged because
          # existing crawls may depend on it.
          session.verify_mode = OpenSSL::SSL::VERIFY_NONE
          session.start
        end

        @sessions[key] = session
      end

      return @sessions[key]
    end

    #
    # Destroys an HTTP session for the given scheme, host and port.
    #
    # @param [URI::HTTP, String] url
    #   The URL of the requested session.
    #
    # @return [Net::HTTP, nil]
    #   The session that was removed from the cache, or `nil` when no
    #   session existed for the URL.
    #
    # @since 0.2.2
    #
    def kill!(url)
      key = key_for(normalize_url(url))

      if (sess = @sessions[key])
        begin
          sess.finish
        rescue IOError
          # the session could not be finished (e.g. it was never
          # started); it is still dropped from the cache below
        end

        @sessions.delete(key)
      end
    end

    #
    # Clears the session cache.
    #
    # @return [SessionCache]
    #   The cleared session cache.
    #
    # @since 0.2.2
    #
    def clear
      @sessions.each_value do |sess|
        begin
          sess.finish
        rescue IOError
          # ignore sessions that cannot be finished and keep clearing
          nil
        end
      end

      @sessions.clear
      return self
    end

    private

    #
    # Coerces the given URL into a URI object.
    #
    # @param [URI::HTTP, String] url
    #   The URL to normalize.
    #
    # @return [URI]
    #   The URL itself when it already is a URI, otherwise the parsed URL.
    #
    def normalize_url(url)
      url.kind_of?(URI) ? url : URI(url.to_s)
    end

    #
    # Builds the cache key for a URI.
    #
    # @param [URI] url
    #   The normalized URL.
    #
    # @return [Array]
    #   The scheme, host and port of the URL.
    #
    def key_for(url)
      [url.scheme, url.host, url.port]
    end

  end
end
@@ -0,0 +1,107 @@
1
+ require 'spidrs/agent'
2
+
3
module Spidr
  # Common proxy port.
  COMMON_PROXY_PORT = 8080

  # Default proxy information. Frozen, and only ever handed out as a
  # copy, so that editing the current proxy settings in place cannot
  # change the defaults.
  DEFAULT_PROXY = {
    :host     => nil,
    :port     => COMMON_PROXY_PORT,
    :user     => nil,
    :password => nil
  }.freeze

  #
  # Proxy information used by all newly created Agent objects by default.
  #
  # @return [Hash]
  #   The Spidr proxy information.
  #
  def Spidr.proxy
    @@spidr_proxy ||= DEFAULT_PROXY.dup
  end

  #
  # Sets the proxy information used by Agent objects.
  #
  # @param [Hash] new_proxy
  #   The new proxy information.
  #
  # @option new_proxy [String] :host
  #   The host-name of the proxy.
  #
  # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
  #   The port of the proxy.
  #
  # @option new_proxy [String] :user
  #   The user to authenticate with the proxy as.
  #
  # @option new_proxy [String] :password
  #   The password to authenticate with the proxy.
  #
  # @return [Hash]
  #   The new proxy information.
  #
  def Spidr.proxy=(new_proxy)
    @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
  end

  #
  # Disables the proxy settings used by all newly created Agent objects.
  #
  # @return [true]
  #
  def Spidr.disable_proxy!
    @@spidr_proxy = DEFAULT_PROXY.dup
    return true
  end

  #
  # The User-Agent string used by all Agent objects by default.
  #
  # @return [String, nil]
  #   The Spidr User-Agent string, or `nil` when none has been set.
  #
  def Spidr.user_agent
    @@spidr_user_agent ||= nil
  end

  #
  # Sets the Spidr User-Agent string.
  #
  # @param [String] new_agent
  #   The new User-Agent string.
  #
  def Spidr.user_agent=(new_agent)
    @@spidr_user_agent = new_agent
  end

  #
  # Forwards to Agent.start_at.
  #
  # This method used to be defined twice, as `(url, options={})` and as
  # `(url, regex, options={})`. The second definition replaced the first,
  # so only the three-argument form was callable. All arguments after
  # the URL are now passed through unchanged, so both call forms reach
  # Agent.start_at.
  #
  # @param [URI::HTTP, String] url
  #   The URL to start spidering at.
  #
  # @param [Array] arguments
  #   The remaining arguments for Agent.start_at. NOTE(review):
  #   presumably an optional regex used to match specific URLs, followed
  #   by an options Hash; confirm against Agent.start_at.
  #
  # @see Agent.start_at
  #
  def Spidr.start_at(url,*arguments,&block)
    Agent.start_at(url,*arguments,&block)
  end

  #
  # @see Agent.host
  #
  def Spidr.host(name,options={},&block)
    Agent.host(name,options,&block)
  end

  #
  # @see Agent.site
  #
  def Spidr.site(url,options={},&block)
    Agent.site(url,options,&block)
  end
end
@@ -0,0 +1,4 @@
1
module Spidr
  # Spidr version. Frozen so the version string cannot be modified in
  # place at runtime.
  VERSION = '1.0.0'.freeze
end
@@ -0,0 +1,4 @@
1
module Spidr
  # Spidr version. Frozen so the version string cannot be modified in
  # place at runtime.
  VERSION = '1.0.0'.freeze
end
data/lib/spidr.rb ADDED
@@ -0,0 +1,3 @@
1
+ require 'spidr/agent'
2
+ require 'spidr/spidr'
3
+ require 'spidr/version'
Binary file
@@ -0,0 +1,59 @@
1
+ require 'spidr/actions'
2
+ require 'spidr/agent'
3
+
4
+ require 'spec_helper'
5
+
6
# Specs for the pause / continue / skip actions of a spidering Agent.
describe Actions do
  let(:url) { URI('http://spidr.rubyforge.org/') }

  it "should be able to pause spidering" do
    count = 0
    agent = Agent.host('spidr.rubyforge.org') do |spider|
      spider.every_page do |_page|
        count += 1
        # pause once the second page has been seen
        spider.pause! if count >= 2
      end
    end

    agent.should be_paused
    agent.history.length.should == 2
  end

  it "should be able to continue spidering after being paused" do
    agent = Agent.new do |spider|
      spider.every_page do |_page|
        spider.pause!
      end
    end

    agent.enqueue(url)
    agent.continue!

    agent.visited?(url).should == true
  end

  it "should allow skipping of enqueued links" do
    agent = Agent.new do |spider|
      spider.every_url do |_url|
        spider.skip_link!
      end
    end

    agent.enqueue(url)

    agent.queue.should be_empty
  end

  it "should allow skipping of visited pages" do
    agent = Agent.new do |spider|
      # the block receives a page; the parameter was previously named
      # `url`, which shadowed the `url` helper defined by `let` above
      spider.every_page do |_page|
        spider.skip_page!
      end
    end

    agent.visit_page(url)

    agent.history.should == Set[url]
    agent.queue.should be_empty
  end
end
@@ -0,0 +1,81 @@
1
+ require 'spidr/agent'
2
+
3
+ require 'spec_helper'
4
+ require 'helpers/wsoc'
5
+
6
# Specs for the history, failures and queue accessors of Agent.
describe Agent do
  include Helpers::WSOC

  # the course is run once and the resulting agent is shared by the
  # examples that read @agent
  before(:all) do
    @agent = run_course
  end

  it "should provide the history" do
    @agent.history.should_not be_empty
  end

  it "should provide the queue" do
    @agent.queue.should be_empty
  end

  it "should be able to restore the history" do
    agent = Agent.new
    previous_history = Set[URI('http://www.example.com')]

    agent.history = previous_history
    agent.history.should == previous_history
  end

  it "should convert new histories to a Set of URIs" do
    agent = Agent.new
    previous_history = ['http://www.example.com']
    expected_history = Set[URI('http://www.example.com')]

    agent.history = previous_history
    agent.history.should_not == previous_history
    agent.history.should == expected_history
  end

  it "should be able to restore the failures" do
    agent = Agent.new
    previous_failures = Set[URI('http://localhost/')]

    agent.failures = previous_failures
    agent.failures.should == previous_failures
  end

  # this example exercises #failures=; its description previously
  # repeated the one used for the history example
  it "should convert new failures to a Set of URIs" do
    agent = Agent.new
    previous_failures = ['http://localhost/']
    expected_failures = Set[URI('http://localhost/')]

    agent.failures = previous_failures
    agent.failures.should_not == previous_failures
    agent.failures.should == expected_failures
  end

  it "should be able to restore the queue" do
    agent = Agent.new
    previous_queue = [URI('http://www.example.com')]

    agent.queue = previous_queue
    agent.queue.should == previous_queue
  end

  it "should convert new queues to an Array of URIs" do
    agent = Agent.new
    previous_queue = ['http://www.example.com']
    expected_queue = [URI('http://www.example.com')]

    agent.queue = previous_queue
    agent.queue.should_not == previous_queue
    agent.queue.should == expected_queue
  end

  it "should provide a to_hash method that returns the queue and history" do
    hash = @agent.to_hash

    hash[:queue].should be_empty
    hash[:history].should_not be_empty
  end
end
@@ -0,0 +1,85 @@
1
+ require 'spidr/auth_store'
2
+
3
+ require 'spec_helper'
4
+
5
# Specs for storing and looking up auth credentials by URL.
describe AuthStore do
  let(:root_uri) { URI('http://zerosum.org/') }
  let(:uri)      { root_uri.merge('/course/auth') }

  # every example starts with credentials stored for `uri`
  before(:each) do
    @auth_store = AuthStore.new
    @auth_store.add(uri, 'admin', 'password')
  end

  after(:each) do
    @auth_store.clear!
  end

  it 'should retrieve auth credentials for the URL' do
    @auth_store[root_uri] = AuthCredential.new('user1', 'pass1')
    @auth_store[root_uri].username.should == 'user1'
    @auth_store[root_uri].password.should == 'pass1'
  end

  it 'should add auth credentials for the URL' do
    lambda {
      @auth_store.add(root_uri, 'user1', 'pass1')
    }.should change(@auth_store, :size)

    @auth_store[root_uri].username.should == 'user1'
    @auth_store[root_uri].password.should == 'pass1'
  end

  describe 'matching' do
    let(:sub_uri) { uri.merge('/course/auth/protected.html') }

    it 'should match a longer URL to the base' do
      @auth_store[sub_uri].username.should == 'admin'
      @auth_store[sub_uri].password.should == 'password'
    end

    it 'should match the longest of all matching URLs' do
      @auth_store.add(uri.merge('/course'), 'user1', 'pass1')
      @auth_store.add(uri.merge('/course/auth/special'), 'user2', 'pass2')
      @auth_store.add(uri.merge('/course/auth/special/extra'), 'user3', 'pass3')

      auth = @auth_store[uri.merge('/course/auth/special/1.html')]
      auth.username.should == 'user2'
      auth.password.should == 'pass2'
    end

    it 'should not match a URL with a different host' do
      remote_uri = URI('http://spidr.rubyforge.org/course/auth')

      @auth_store[remote_uri].should be_nil
    end

    it 'should not match a URL with an alternate path' do
      relative_uri = uri.merge('/course/admin/protected.html')

      @auth_store[relative_uri].should be_nil
    end
  end

  it 'should override previous auth credentials' do
    @auth_store.add(uri, 'newuser', 'newpass')

    @auth_store[uri].username.should == 'newuser'
    @auth_store[uri].password.should == 'newpass'
  end

  # the store holds auth credentials; the description previously said
  # "cookies"
  it 'should clear all auth credentials' do
    @auth_store.clear!
    @auth_store.size.should == 0
  end

  describe 'for_url' do
    it 'should return nil if no authorization exists' do
      @auth_store.for_url(URI('http://php.net')).should be_nil
    end

    it 'should create an encoded authorization string' do
      # "YWRtaW46cGFzc3dvcmQ=" is the Base64 encoding of "admin:password"
      @auth_store.for_url(uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
    end
  end
end