spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -0,0 +1,112 @@
1
+ module Spidr
2
+ class Page
3
+ #
4
+ # The response code from the page.
5
+ #
6
+ # @return [Integer]
7
+ # Response code from the page.
8
+ #
9
+ def code
10
+ @response.code.to_i
11
+ end
12
+
13
+ #
14
+ # Determines if the response code is `200`.
15
+ #
16
+ # @return [Boolean]
17
+ # Specifies whether the response code is `200`.
18
+ #
19
+ def is_ok?
20
+ code == 200
21
+ end
22
+
23
+ alias ok? is_ok?
24
+
25
+ #
26
+ # Determines if the response code is `308`.
27
+ #
28
+ # @return [Boolean]
29
+ # Specifies whether the response code is `308`.
30
+ #
31
+ def timedout?
32
+ code == 308
33
+ end
34
+
35
+ #
36
+ # Determines if the response code is `400`.
37
+ #
38
+ # @return [Boolean]
39
+ # Specifies whether the response code is `400`.
40
+ #
41
+ def bad_request?
42
+ code == 400
43
+ end
44
+
45
+ #
46
+ # Determines if the response code is `401`.
47
+ #
48
+ # @return [Boolean]
49
+ # Specifies whether the response code is `401`.
50
+ #
51
+ def is_unauthorized?
52
+ code == 401
53
+ end
54
+
55
+ alias unauthorized? is_unauthorized?
56
+
57
+ #
58
+ # Determines if the response code is `403`.
59
+ #
60
+ # @return [Boolean]
61
+ # Specifies whether the response code is `403`.
62
+ #
63
+ def is_forbidden?
64
+ code == 403
65
+ end
66
+
67
+ alias forbidden? is_forbidden?
68
+
69
+ #
70
+ # Determines if the response code is `404`.
71
+ #
72
+ # @return [Boolean]
73
+ # Specifies whether the response code is `404`.
74
+ #
75
+ def is_missing?
76
+ code == 404
77
+ end
78
+
79
+ alias missing? is_missing?
80
+
81
+ #
82
+ # Determines if the response code is `500`.
83
+ #
84
+ # @return [Boolean]
85
+ # Specifies whether the response code is `500`.
86
+ #
87
+ def had_internal_server_error?
88
+ code == 500
89
+ end
90
+
91
+ #
92
+ # Determines if the response code is `300`, `301`, `302`, `303`
93
+ # or `307`. Also checks for "soft" redirects added at the page
94
+ # level by a meta refresh tag.
95
+ #
96
+ # @return [Boolean]
97
+ # Specifies whether the response code is a HTTP Redirect code.
98
+ #
99
+ def is_redirect?
100
+ case code
101
+ when 300..303, 307
102
+ true
103
+ when 200
104
+ meta_redirect?
105
+ else
106
+ false
107
+ end
108
+ end
109
+
110
+ alias redirect? is_redirect?
111
+ end
112
+ end
@@ -0,0 +1,56 @@
1
+ module Spidr
2
+ #
3
+ # @since 0.6.0
4
+ #
5
+ class Proxy < Struct.new(:host, :port, :user, :password)
6
+
7
+ # Default port to use.
8
+ DEFAULT_PORT = 8080
9
+
10
+ #
11
+ # Initializes the proxy.
12
+ #
13
+ # @param [Hash] attributes
14
+ # Attributes for the proxy.
15
+ #
16
+ # @option attributes [String] :host
17
+ # The host the proxy is running on.
18
+ #
19
+ # @option attributes [Integer] :port
20
+ # The port the proxy is running on.
21
+ #
22
+ # @option attributes [String] :user
23
+ # The user to authenticate as with the proxy.
24
+ #
25
+ # @option attributes [String] :password
26
+ # The password to authenticate with.
27
+ #
28
+ def initialize(attributes={})
29
+ super(
30
+ attributes[:host],
31
+ attributes.fetch(:port,DEFAULT_PORT),
32
+ attributes[:user],
33
+ attributes[:password]
34
+ )
35
+ end
36
+
37
+ #
38
+ # Determines if the proxy settings are set.
39
+ #
40
+ # @return [Boolean]
41
+ #
42
+ def enabled?
43
+ !host.nil?
44
+ end
45
+
46
+ #
47
+ # Determines if the proxy is not set.
48
+ #
49
+ # @return [Boolean]
50
+ #
51
+ def disabled?
52
+ host.nil?
53
+ end
54
+
55
+ end
56
+ end
@@ -1,6 +1,9 @@
1
+ require 'spidr/settings/proxy'
2
+ require 'spidr/settings/timeouts'
1
3
  require 'spidr/spidr'
2
4
 
3
5
  require 'net/http'
6
+ require 'openssl'
4
7
 
5
8
  module Spidr
6
9
  #
@@ -8,31 +11,44 @@ module Spidr
8
11
  #
9
12
  class SessionCache
10
13
 
11
- # Proxy to use
12
- attr_accessor :proxy
14
+ include Settings::Proxy
15
+ include Settings::Timeouts
13
16
 
14
17
  #
15
18
  # Creates a new session cache.
16
19
  #
17
- # @param [Hash] proxy (Spidr.proxy)
20
+ # @param [Hash] options
21
+ # Configuration options.
22
+ #
23
+ # @option [Hash] :proxy (Spidr.proxy)
18
24
  # Proxy options.
19
25
  #
20
- # @option proxy [String] :host
21
- # The host the proxy is running on.
26
+ # @option [Integer] :open_timeout (Spidr.open_timeout)
27
+ # Optional open timeout.
22
28
  #
23
- # @option proxy [Integer] :port
24
- # The port the proxy is running on.
29
+ # @option [Integer] :ssl_timeout (Spidr.ssl_timeout)
30
+ # Optional ssl timeout.
25
31
  #
26
- # @option proxy [String] :user
27
- # The user to authenticate as with the proxy.
32
+ # @option [Integer] :read_timeout (Spidr.read_timeout)
33
+ # Optional read timeout.
28
34
  #
29
- # @option proxy [String] :password
30
- # The password to authenticate with.
35
+ # @option [Integer] :continue_timeout (Spidr.continue_timeout)
36
+ # Optional `Continue` timeout.
31
37
  #
32
- # @since 0.2.2
38
+ # @option [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
39
+ # Optional `Keep-Alive` timeout.
33
40
  #
34
- def initialize(proxy=Spidr.proxy)
35
- @proxy = proxy
41
+ # @since 0.6.0
42
+ #
43
+ def initialize(options={})
44
+ @proxy = options.fetch(:proxy,Spidr.proxy)
45
+
46
+ @open_timeout = options.fetch(:open_timeout,Spidr.open_timeout)
47
+ @ssl_timeout = options.fetch(:ssl_timeout,Spidr.ssl_timeout)
48
+ @read_timeout = options.fetch(:read_timeout,Spidr.read_timeout)
49
+ @continue_timeout = options.fetch(:continue_timeout,Spidr.continue_timeout)
50
+ @keep_alive_timeout = options.fetch(:keep_alive_timeout,Spidr.keep_alive_timeout)
51
+
36
52
  @sessions = {}
37
53
  end
38
54
 
@@ -52,7 +68,7 @@ module Spidr
52
68
  url = URI(url.to_s) unless url.kind_of?(URI)
53
69
 
54
70
  # session key
55
- key = [url.scheme, url.host, url.port]
71
+ key = key_for(url)
56
72
 
57
73
  return @sessions.has_key?(key)
58
74
  end
@@ -71,19 +87,25 @@ module Spidr
71
87
  url = URI(url.to_s) unless url.kind_of?(URI)
72
88
 
73
89
  # session key
74
- key = [url.scheme, url.host, url.port]
90
+ key = key_for(url)
75
91
 
76
92
  unless @sessions[key]
77
93
  session = Net::HTTP::Proxy(
78
- @proxy[:host],
79
- @proxy[:port],
80
- @proxy[:user],
81
- @proxy[:password]
94
+ @proxy.host,
95
+ @proxy.port,
96
+ @proxy.user,
97
+ @proxy.password
82
98
  ).new(url.host,url.port)
83
99
 
100
+ session.open_timeout = @open_timeout if @open_timeout
101
+ session.read_timeout = @read_timeout if @read_timeout
102
+ session.continue_timeout = @continue_timeout if @continue_timeout
103
+ session.keep_alive_timeout = @keep_alive_timeout if @keep_alive_timeout
104
+
84
105
  if url.scheme == 'https'
85
106
  session.use_ssl = true
86
107
  session.verify_mode = OpenSSL::SSL::VERIFY_NONE
108
+ session.ssl_timeout = @ssl_timeout
87
109
  session.start
88
110
  end
89
111
 
@@ -108,7 +130,7 @@ module Spidr
108
130
  url = URI(url.to_s) unless url.kind_of?(URI)
109
131
 
110
132
  # session key
111
- key = [url.scheme, url.host, url.port]
133
+ key = key_for(url)
112
134
 
113
135
  if (sess = @sessions[key])
114
136
  begin
@@ -129,11 +151,10 @@ module Spidr
129
151
  # @since 0.2.2
130
152
  #
131
153
  def clear
132
- @sessions.each_value do |sess|
154
+ @sessions.each_value do |session|
133
155
  begin
134
- sess.finish
156
+ session.finish
135
157
  rescue IOError
136
- nil
137
158
  end
138
159
  end
139
160
 
@@ -141,5 +162,20 @@ module Spidr
141
162
  return self
142
163
  end
143
164
 
165
+ private
166
+
167
+ #
168
+ # Creates a session key based on the URL.
169
+ #
170
+ # @param [URI::HTTP] url
171
+ # The given URL.
172
+ #
173
+ # @return [Array]
174
+ # The session key containing the scheme, host and port.
175
+ #
176
+ def key_for(url)
177
+ [url.scheme, url.host, url.port]
178
+ end
179
+
144
180
  end
145
181
  end
@@ -0,0 +1,3 @@
1
+ require 'spidr/settings/proxy'
2
+ require 'spidr/settings/timeouts'
3
+ require 'spidr/settings/user_agent'
@@ -0,0 +1,61 @@
1
+ require 'spidr/proxy'
2
+
3
+ module Spidr
4
+ module Settings
5
+ #
6
+ # Methods for configuring a proxy.
7
+ #
8
+ # @since 0.6.0
9
+ #
10
+ module Proxy
11
+ #
12
+ # Proxy information used by all newly created Agent objects by default.
13
+ #
14
+ # @return [Spidr::Proxy]
15
+ # The Spidr proxy information.
16
+ #
17
+ def proxy
18
+ @proxy ||= Spidr::Proxy.new
19
+ end
20
+
21
+ #
22
+ # Sets the proxy information used by Agent objects.
23
+ #
24
+ # @param [Spidr::Proxy, Hash, nil] new_proxy
25
+ # The new proxy information.
26
+ #
27
+ # @option new_proxy [String] :host
28
+ # The host-name of the proxy.
29
+ #
30
+ # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
31
+ # The port of the proxy.
32
+ #
33
+ # @option new_proxy [String] :user
34
+ # The user to authenticate with the proxy as.
35
+ #
36
+ # @option new_proxy [String] :password
37
+ # The password to authenticate with the proxy.
38
+ #
39
+ # @return [Spidr::Proxy]
40
+ # The new proxy information.
41
+ #
42
+ def proxy=(new_proxy)
43
+ @proxy = case new_proxy
44
+ when Spidr::Proxy then new_proxy
45
+ when Hash then Spidr::Proxy.new(new_proxy)
46
+ when nil then Spidr::Proxy.new
47
+ else
48
+ raise(TypeError,"#{self.class}#{__method__} only accepts Proxy, Hash or nil")
49
+ end
50
+ end
51
+
52
+ #
53
+ # Disables the proxy settings used by all newly created Agent objects.
54
+ #
55
+ def disable_proxy!
56
+ @proxy = Spidr::Proxy.new
57
+ return true
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,33 @@
1
+ module Spidr
2
+ module Settings
3
+ #
4
+ # @since 0.6.0
5
+ #
6
+ module Timeouts
7
+ # Read timeout.
8
+ #
9
+ # @return [Integer, nil]
10
+ attr_accessor :read_timeout
11
+
12
+ # Open timeout.
13
+ #
14
+ # @return [Integer, nil]
15
+ attr_accessor :open_timeout
16
+
17
+ # SSL timeout.
18
+ #
19
+ # @return [Integer, nil]
20
+ attr_accessor :ssl_timeout
21
+
22
+ # `Continue` timeout.
23
+ #
24
+ # @return [Integer, nil]
25
+ attr_accessor :continue_timeout
26
+
27
+ # `Keep-Alive` timeout.
28
+ #
29
+ # @return [Integer, nil]
30
+ attr_accessor :keep_alive_timeout
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,14 @@
1
+ module Spidr
2
+ module Settings
3
+ #
4
+ # @since 0.6.0
5
+ #
6
+ module UserAgent
7
+ # The User-Agent string used by all Agent objects by default.
8
+ #
9
+ # @return [String]
10
+ # The Spidr User-Agent string.
11
+ attr_accessor :user_agent
12
+ end
13
+ end
14
+ end
@@ -1,79 +1,12 @@
1
+ require 'spidr/settings/proxy'
2
+ require 'spidr/settings/timeouts'
3
+ require 'spidr/settings/user_agent'
1
4
  require 'spidr/agent'
2
5
 
3
6
  module Spidr
4
- # Common proxy port.
5
- COMMON_PROXY_PORT = 8080
6
-
7
- # Default proxy information.
8
- DEFAULT_PROXY = {
9
- host: nil,
10
- port: COMMON_PROXY_PORT,
11
- user: nil,
12
- password: nil
13
- }
14
-
15
- #
16
- # Proxy information used by all newly created Agent objects by default.
17
- #
18
- # @return [Hash]
19
- # The Spidr proxy information.
20
- #
21
- def Spidr.proxy
22
- @@spidr_proxy ||= DEFAULT_PROXY
23
- end
24
-
25
- #
26
- # Sets the proxy information used by Agent objects.
27
- #
28
- # @param [Hash] new_proxy
29
- # The new proxy information.
30
- #
31
- # @option new_proxy [String] :host
32
- # The host-name of the proxy.
33
- #
34
- # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
35
- # The port of the proxy.
36
- #
37
- # @option new_proxy [String] :user
38
- # The user to authenticate with the proxy as.
39
- #
40
- # @option new_proxy [String] :password
41
- # The password to authenticate with the proxy.
42
- #
43
- # @return [Hash]
44
- # The new proxy information.
45
- #
46
- def Spidr.proxy=(new_proxy)
47
- @@spidr_proxy = {port: COMMON_PROXY_PORT}.merge(new_proxy)
48
- end
49
-
50
- #
51
- # Disables the proxy settings used by all newly created Agent objects.
52
- #
53
- def Spidr.disable_proxy!
54
- @@spidr_proxy = DEFAULT_PROXY
55
- return true
56
- end
57
-
58
- #
59
- # The User-Agent string used by all Agent objects by default.
60
- #
61
- # @return [String]
62
- # The Spidr User-Agent string.
63
- #
64
- def Spidr.user_agent
65
- @@spidr_user_agent ||= nil
66
- end
67
-
68
- #
69
- # Sets the Spidr User-Agent string.
70
- #
71
- # @param [String] new_agent
72
- # The new User-Agent string.
73
- #
74
- def Spidr.user_agent=(new_agent)
75
- @@spidr_user_agent = new_agent
76
- end
7
+ extend Settings::Proxy
8
+ extend Settings::Timeouts
9
+ extend Settings::UserAgent
77
10
 
78
11
  #
79
12
  # Specifies whether `robots.txt` should be honored globally.
@@ -82,7 +15,7 @@ module Spidr
82
15
  #
83
16
  # @since 0.5.0
84
17
  #
85
- def Spidr.robots?
18
+ def self.robots?
86
19
  @robots
87
20
  end
88
21
 
@@ -95,31 +28,34 @@ module Spidr
95
28
  #
96
29
  # @since 0.5.0
97
30
  #
98
- def Spidr.robots=(mode)
31
+ def self.robots=(mode)
99
32
  @robots = mode
100
33
  end
101
34
 
102
35
  #
103
36
  # @see Agent.start_at
104
37
  #
105
- def Spidr.start_at(url,options={},&block)
38
+ def self.start_at(url,options={},&block)
106
39
  Agent.start_at(url,options,&block)
107
40
  end
108
41
 
109
42
  #
110
43
  # @see Agent.host
111
44
  #
112
- def Spidr.host(name,options={},&block)
45
+ def self.host(name,options={},&block)
113
46
  Agent.host(name,options,&block)
114
47
  end
115
48
 
116
49
  #
117
50
  # @see Agent.site
118
51
  #
119
- def Spidr.site(url,options={},&block)
52
+ def self.site(url,options={},&block)
120
53
  Agent.site(url,options,&block)
121
54
  end
122
55
 
123
- def Spidr.robots
56
+ #
57
+ # @abstract
58
+ #
59
+ def self.robots
124
60
  end
125
61
  end