spidr 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -0,0 +1,112 @@
1
+ module Spidr
2
+ class Page
3
+ #
4
+ # The response code from the page.
5
+ #
6
+ # @return [Integer]
7
+ # Response code from the page.
8
+ #
9
+ def code
10
+ @response.code.to_i
11
+ end
12
+
13
+ #
14
+ # Determines if the response code is `200`.
15
+ #
16
+ # @return [Boolean]
17
+ # Specifies whether the response code is `200`.
18
+ #
19
+ def is_ok?
20
+ code == 200
21
+ end
22
+
23
+ alias ok? is_ok?
24
+
25
+ #
26
+ # Determines if the response code is `308`.
27
+ #
28
+ # @return [Boolean]
29
+ # Specifies whether the response code is `308`.
30
+ #
31
+ def timedout?
32
+ code == 308
33
+ end
34
+
35
+ #
36
+ # Determines if the response code is `400`.
37
+ #
38
+ # @return [Boolean]
39
+ # Specifies whether the response code is `400`.
40
+ #
41
+ def bad_request?
42
+ code == 400
43
+ end
44
+
45
+ #
46
+ # Determines if the response code is `401`.
47
+ #
48
+ # @return [Boolean]
49
+ # Specifies whether the response code is `401`.
50
+ #
51
+ def is_unauthorized?
52
+ code == 401
53
+ end
54
+
55
+ alias unauthorized? is_unauthorized?
56
+
57
+ #
58
+ # Determines if the response code is `403`.
59
+ #
60
+ # @return [Boolean]
61
+ # Specifies whether the response code is `403`.
62
+ #
63
+ def is_forbidden?
64
+ code == 403
65
+ end
66
+
67
+ alias forbidden? is_forbidden?
68
+
69
+ #
70
+ # Determines if the response code is `404`.
71
+ #
72
+ # @return [Boolean]
73
+ # Specifies whether the response code is `404`.
74
+ #
75
+ def is_missing?
76
+ code == 404
77
+ end
78
+
79
+ alias missing? is_missing?
80
+
81
+ #
82
+ # Determines if the response code is `500`.
83
+ #
84
+ # @return [Boolean]
85
+ # Specifies whether the response code is `500`.
86
+ #
87
+ def had_internal_server_error?
88
+ code == 500
89
+ end
90
+
91
+ #
92
+ # Determines if the response code is `300`, `301`, `302`, `303`
93
+ # or `307`. Also checks for "soft" redirects added at the page
94
+ # level by a meta refresh tag.
95
+ #
96
+ # @return [Boolean]
97
+ # Specifies whether the response code is a HTTP Redirect code.
98
+ #
99
+ def is_redirect?
100
+ case code
101
+ when 300..303, 307
102
+ true
103
+ when 200
104
+ meta_redirect?
105
+ else
106
+ false
107
+ end
108
+ end
109
+
110
+ alias redirect? is_redirect?
111
+ end
112
+ end
@@ -0,0 +1,56 @@
1
+ module Spidr
2
+ #
3
+ # @since 0.6.0
4
+ #
5
+ class Proxy < Struct.new(:host, :port, :user, :password)
6
+
7
+ # Default port to use.
8
+ DEFAULT_PORT = 8080
9
+
10
+ #
11
+ # Initializes the proxy.
12
+ #
13
+ # @param [Hash] attributes
14
+ # Attributes for the proxy.
15
+ #
16
+ # @option attributes [String] :host
17
+ # The host the proxy is running on.
18
+ #
19
+ # @option attributes [Integer] :port
20
+ # The port the proxy is running on.
21
+ #
22
+ # @option attributes [String] :user
23
+ # The user to authenticate as with the proxy.
24
+ #
25
+ # @option attributes [String] :password
26
+ # The password to authenticate with.
27
+ #
28
+ def initialize(attributes={})
29
+ super(
30
+ attributes[:host],
31
+ attributes.fetch(:port,DEFAULT_PORT),
32
+ attributes[:user],
33
+ attributes[:password]
34
+ )
35
+ end
36
+
37
+ #
38
+ # Determines if the proxy settings are set.
39
+ #
40
+ # @return [Boolean]
41
+ #
42
+ def enabled?
43
+ !host.nil?
44
+ end
45
+
46
+ #
47
+ # Determines if the proxy is not set.
48
+ #
49
+ # @return [Boolean]
50
+ #
51
+ def disabled?
52
+ host.nil?
53
+ end
54
+
55
+ end
56
+ end
@@ -1,6 +1,9 @@
1
+ require 'spidr/settings/proxy'
2
+ require 'spidr/settings/timeouts'
1
3
  require 'spidr/spidr'
2
4
 
3
5
  require 'net/http'
6
+ require 'openssl'
4
7
 
5
8
  module Spidr
6
9
  #
@@ -8,31 +11,44 @@ module Spidr
8
11
  #
9
12
  class SessionCache
10
13
 
11
- # Proxy to use
12
- attr_accessor :proxy
14
+ include Settings::Proxy
15
+ include Settings::Timeouts
13
16
 
14
17
  #
15
18
  # Creates a new session cache.
16
19
  #
17
- # @param [Hash] proxy (Spidr.proxy)
20
+ # @param [Hash] options
21
+ # Configuration options.
22
+ #
23
+ # @option options [Hash] :proxy (Spidr.proxy)
18
24
  # Proxy options.
19
25
  #
20
- # @option proxy [String] :host
21
- # The host the proxy is running on.
26
+ # @option options [Integer] :open_timeout (Spidr.open_timeout)
27
+ # Optional open timeout.
22
28
  #
23
- # @option proxy [Integer] :port
24
- # The port the proxy is running on.
29
+ # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
30
+ # Optional ssl timeout.
25
31
  #
26
- # @option proxy [String] :user
27
- # The user to authenticate as with the proxy.
32
+ # @option options [Integer] :read_timeout (Spidr.read_timeout)
33
+ # Optional read timeout.
28
34
  #
29
- # @option proxy [String] :password
30
- # The password to authenticate with.
35
+ # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
36
+ # Optional `Continue` timeout.
31
37
  #
32
- # @since 0.2.2
38
+ # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
39
+ # Optional `Keep-Alive` timeout.
33
40
  #
34
- def initialize(proxy=Spidr.proxy)
35
- @proxy = proxy
41
+ # @since 0.6.0
42
+ #
43
+ def initialize(options={})
44
+ @proxy = options.fetch(:proxy,Spidr.proxy)
45
+
46
+ @open_timeout = options.fetch(:open_timeout,Spidr.open_timeout)
47
+ @ssl_timeout = options.fetch(:ssl_timeout,Spidr.ssl_timeout)
48
+ @read_timeout = options.fetch(:read_timeout,Spidr.read_timeout)
49
+ @continue_timeout = options.fetch(:continue_timeout,Spidr.continue_timeout)
50
+ @keep_alive_timeout = options.fetch(:keep_alive_timeout,Spidr.keep_alive_timeout)
51
+
36
52
  @sessions = {}
37
53
  end
38
54
 
@@ -52,7 +68,7 @@ module Spidr
52
68
  url = URI(url.to_s) unless url.kind_of?(URI)
53
69
 
54
70
  # session key
55
- key = [url.scheme, url.host, url.port]
71
+ key = key_for(url)
56
72
 
57
73
  return @sessions.has_key?(key)
58
74
  end
@@ -71,19 +87,25 @@ module Spidr
71
87
  url = URI(url.to_s) unless url.kind_of?(URI)
72
88
 
73
89
  # session key
74
- key = [url.scheme, url.host, url.port]
90
+ key = key_for(url)
75
91
 
76
92
  unless @sessions[key]
77
93
  session = Net::HTTP::Proxy(
78
- @proxy[:host],
79
- @proxy[:port],
80
- @proxy[:user],
81
- @proxy[:password]
94
+ @proxy.host,
95
+ @proxy.port,
96
+ @proxy.user,
97
+ @proxy.password
82
98
  ).new(url.host,url.port)
83
99
 
100
+ session.open_timeout = @open_timeout if @open_timeout
101
+ session.read_timeout = @read_timeout if @read_timeout
102
+ session.continue_timeout = @continue_timeout if @continue_timeout
103
+ session.keep_alive_timeout = @keep_alive_timeout if @keep_alive_timeout
104
+
84
105
  if url.scheme == 'https'
85
106
  session.use_ssl = true
86
107
  session.verify_mode = OpenSSL::SSL::VERIFY_NONE
108
+ session.ssl_timeout = @ssl_timeout
87
109
  session.start
88
110
  end
89
111
 
@@ -108,7 +130,7 @@ module Spidr
108
130
  url = URI(url.to_s) unless url.kind_of?(URI)
109
131
 
110
132
  # session key
111
- key = [url.scheme, url.host, url.port]
133
+ key = key_for(url)
112
134
 
113
135
  if (sess = @sessions[key])
114
136
  begin
@@ -129,11 +151,10 @@ module Spidr
129
151
  # @since 0.2.2
130
152
  #
131
153
  def clear
132
- @sessions.each_value do |sess|
154
+ @sessions.each_value do |session|
133
155
  begin
134
- sess.finish
156
+ session.finish
135
157
  rescue IOError
136
- nil
137
158
  end
138
159
  end
139
160
 
@@ -141,5 +162,20 @@ module Spidr
141
162
  return self
142
163
  end
143
164
 
165
+ private
166
+
167
+ #
168
+ # Creates a session key based on the URL.
169
+ #
170
+ # @param [URI::HTTP] url
171
+ # The given URL.
172
+ #
173
+ # @return [Array]
174
+ # The session key containing the scheme, host and port.
175
+ #
176
+ def key_for(url)
177
+ [url.scheme, url.host, url.port]
178
+ end
179
+
144
180
  end
145
181
  end
@@ -0,0 +1,3 @@
1
+ require 'spidr/settings/proxy'
2
+ require 'spidr/settings/timeouts'
3
+ require 'spidr/settings/user_agent'
@@ -0,0 +1,61 @@
1
+ require 'spidr/proxy'
2
+
3
+ module Spidr
4
+ module Settings
5
+ #
6
+ # Methods for configuring a proxy.
7
+ #
8
+ # @since 0.6.0
9
+ #
10
+ module Proxy
11
+ #
12
+ # Proxy information used by all newly created Agent objects by default.
13
+ #
14
+ # @return [Spidr::Proxy]
15
+ # The Spidr proxy information.
16
+ #
17
+ def proxy
18
+ @proxy ||= Spidr::Proxy.new
19
+ end
20
+
21
+ #
22
+ # Sets the proxy information used by Agent objects.
23
+ #
24
+ # @param [Spidr::Proxy, Hash, nil] new_proxy
25
+ # The new proxy information.
26
+ #
27
+ # @option new_proxy [String] :host
28
+ # The host-name of the proxy.
29
+ #
30
+ # @option new_proxy [Integer] :port (Spidr::Proxy::DEFAULT_PORT)
31
+ # The port of the proxy.
32
+ #
33
+ # @option new_proxy [String] :user
34
+ # The user to authenticate with the proxy as.
35
+ #
36
+ # @option new_proxy [String] :password
37
+ # The password to authenticate with the proxy.
38
+ #
39
+ # @return [Spidr::Proxy]
40
+ # The new proxy information.
41
+ #
42
+ def proxy=(new_proxy)
43
+ @proxy = case new_proxy
44
+ when Spidr::Proxy then new_proxy
45
+ when Hash then Spidr::Proxy.new(new_proxy)
46
+ when nil then Spidr::Proxy.new
47
+ else
48
+ raise(TypeError,"#{self.class}#{__method__} only accepts Proxy, Hash or nil")
49
+ end
50
+ end
51
+
52
+ #
53
+ # Disables the proxy settings used by all newly created Agent objects.
54
+ #
55
+ def disable_proxy!
56
+ @proxy = Spidr::Proxy.new
57
+ return true
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,33 @@
1
+ module Spidr
2
+ module Settings
3
+ #
4
+ # @since 0.6.0
5
+ #
6
+ module Timeouts
7
+ # Read timeout.
8
+ #
9
+ # @return [Integer, nil]
10
+ attr_accessor :read_timeout
11
+
12
+ # Open timeout.
13
+ #
14
+ # @return [Integer, nil]
15
+ attr_accessor :open_timeout
16
+
17
+ # SSL timeout.
18
+ #
19
+ # @return [Integer, nil]
20
+ attr_accessor :ssl_timeout
21
+
22
+ # `Continue` timeout.
23
+ #
24
+ # @return [Integer, nil]
25
+ attr_accessor :continue_timeout
26
+
27
+ # `Keep-Alive` timeout.
28
+ #
29
+ # @return [Integer, nil]
30
+ attr_accessor :keep_alive_timeout
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,14 @@
1
+ module Spidr
2
+ module Settings
3
+ #
4
+ # @since 0.6.0
5
+ #
6
+ module UserAgent
7
+ # The User-Agent string used by all Agent objects by default.
8
+ #
9
+ # @return [String]
10
+ # The Spidr User-Agent string.
11
+ attr_accessor :user_agent
12
+ end
13
+ end
14
+ end
@@ -1,79 +1,12 @@
1
+ require 'spidr/settings/proxy'
2
+ require 'spidr/settings/timeouts'
3
+ require 'spidr/settings/user_agent'
1
4
  require 'spidr/agent'
2
5
 
3
6
  module Spidr
4
- # Common proxy port.
5
- COMMON_PROXY_PORT = 8080
6
-
7
- # Default proxy information.
8
- DEFAULT_PROXY = {
9
- host: nil,
10
- port: COMMON_PROXY_PORT,
11
- user: nil,
12
- password: nil
13
- }
14
-
15
- #
16
- # Proxy information used by all newly created Agent objects by default.
17
- #
18
- # @return [Hash]
19
- # The Spidr proxy information.
20
- #
21
- def Spidr.proxy
22
- @@spidr_proxy ||= DEFAULT_PROXY
23
- end
24
-
25
- #
26
- # Sets the proxy information used by Agent objects.
27
- #
28
- # @param [Hash] new_proxy
29
- # The new proxy information.
30
- #
31
- # @option new_proxy [String] :host
32
- # The host-name of the proxy.
33
- #
34
- # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
35
- # The port of the proxy.
36
- #
37
- # @option new_proxy [String] :user
38
- # The user to authenticate with the proxy as.
39
- #
40
- # @option new_proxy [String] :password
41
- # The password to authenticate with the proxy.
42
- #
43
- # @return [Hash]
44
- # The new proxy information.
45
- #
46
- def Spidr.proxy=(new_proxy)
47
- @@spidr_proxy = {port: COMMON_PROXY_PORT}.merge(new_proxy)
48
- end
49
-
50
- #
51
- # Disables the proxy settings used by all newly created Agent objects.
52
- #
53
- def Spidr.disable_proxy!
54
- @@spidr_proxy = DEFAULT_PROXY
55
- return true
56
- end
57
-
58
- #
59
- # The User-Agent string used by all Agent objects by default.
60
- #
61
- # @return [String]
62
- # The Spidr User-Agent string.
63
- #
64
- def Spidr.user_agent
65
- @@spidr_user_agent ||= nil
66
- end
67
-
68
- #
69
- # Sets the Spidr User-Agent string.
70
- #
71
- # @param [String] new_agent
72
- # The new User-Agent string.
73
- #
74
- def Spidr.user_agent=(new_agent)
75
- @@spidr_user_agent = new_agent
76
- end
7
+ extend Settings::Proxy
8
+ extend Settings::Timeouts
9
+ extend Settings::UserAgent
77
10
 
78
11
  #
79
12
  # Specifies whether `robots.txt` should be honored globally.
@@ -82,7 +15,7 @@ module Spidr
82
15
  #
83
16
  # @since 0.5.0
84
17
  #
85
- def Spidr.robots?
18
+ def self.robots?
86
19
  @robots
87
20
  end
88
21
 
@@ -95,31 +28,34 @@ module Spidr
95
28
  #
96
29
  # @since 0.5.0
97
30
  #
98
- def Spidr.robots=(mode)
31
+ def self.robots=(mode)
99
32
  @robots = mode
100
33
  end
101
34
 
102
35
  #
103
36
  # @see Agent.start_at
104
37
  #
105
- def Spidr.start_at(url,options={},&block)
38
+ def self.start_at(url,options={},&block)
106
39
  Agent.start_at(url,options,&block)
107
40
  end
108
41
 
109
42
  #
110
43
  # @see Agent.host
111
44
  #
112
- def Spidr.host(name,options={},&block)
45
+ def self.host(name,options={},&block)
113
46
  Agent.host(name,options,&block)
114
47
  end
115
48
 
116
49
  #
117
50
  # @see Agent.site
118
51
  #
119
- def Spidr.site(url,options={},&block)
52
+ def self.site(url,options={},&block)
120
53
  Agent.site(url,options,&block)
121
54
  end
122
55
 
123
- def Spidr.robots
56
+ #
57
+ # @abstract
58
+ #
59
+ def self.robots
124
60
  end
125
61
  end