spidr 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
@@ -0,0 +1,112 @@
|
|
1
|
+
module Spidr
  class Page
    #
    # The response code from the page.
    #
    # @return [Integer]
    #   Response code from the page.
    #
    def code
      @response.code.to_i
    end

    #
    # Determines if the response code is `200`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `200`.
    #
    def is_ok?
      code == 200
    end

    alias ok? is_ok?

    #
    # Determines if the response code is `408` (Request Timeout).
    #
    # @return [Boolean]
    #   Specifies whether the response code is `408`.
    #
    def timedout?
      # 408 is Request Timeout; 308 (previously tested here) is the
      # Permanent Redirect status and is handled by #is_redirect?.
      code == 408
    end

    #
    # Determines if the response code is `400`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `400`.
    #
    def bad_request?
      code == 400
    end

    #
    # Determines if the response code is `401`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `401`.
    #
    def is_unauthorized?
      code == 401
    end

    alias unauthorized? is_unauthorized?

    #
    # Determines if the response code is `403`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `403`.
    #
    def is_forbidden?
      code == 403
    end

    alias forbidden? is_forbidden?

    #
    # Determines if the response code is `404`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `404`.
    #
    def is_missing?
      code == 404
    end

    alias missing? is_missing?

    #
    # Determines if the response code is `500`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `500`.
    #
    def had_internal_server_error?
      code == 500
    end

    #
    # Determines if the response code is `300`, `301`, `302`, `303`,
    # `307` or `308`. Also checks for "soft" redirects added at the page
    # level by a meta refresh tag.
    #
    # @return [Boolean]
    #   Specifies whether the response code is a HTTP Redirect code.
    #
    def is_redirect?
      case code
      when 300..303, 307, 308
        true
      when 200
        # A successful response may still redirect via its markup;
        # `meta_redirect?` is not defined in this file (presumably provided
        # by another part of Page).
        meta_redirect?
      else
        false
      end
    end

    alias redirect? is_redirect?
  end
end
|
data/lib/spidr/proxy.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
module Spidr
  #
  # Value object holding the host, port and credentials of a proxy.
  #
  # @since 0.6.0
  #
  class Proxy < Struct.new(:host, :port, :user, :password)

    # Port assumed when the attributes do not specify one.
    DEFAULT_PORT = 8080

    #
    # Builds the proxy from a Hash of attributes.
    #
    # @param [Hash] attributes
    #   Attributes describing the proxy.
    #
    # @option attributes [String] :host
    #   Host the proxy runs on.
    #
    # @option attributes [Integer] :port (DEFAULT_PORT)
    #   Port the proxy listens on.
    #
    # @option attributes [String] :user
    #   User name to authenticate with.
    #
    # @option attributes [String] :password
    #   Password to authenticate with.
    #
    def initialize(attributes={})
      proxy_host, proxy_user, proxy_password = attributes.values_at(:host, :user, :password)

      # Only a missing :port key falls back to the default; an explicit
      # nil is stored as given.
      proxy_port = attributes.fetch(:port) { DEFAULT_PORT }

      super(proxy_host, proxy_port, proxy_user, proxy_password)
    end

    #
    # Whether a proxy host has been configured.
    #
    # @return [Boolean]
    #
    def enabled?
      !disabled?
    end

    #
    # Whether no proxy host has been configured.
    #
    # @return [Boolean]
    #
    def disabled?
      host.nil?
    end

  end
end
|
data/lib/spidr/session_cache.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
require 'spidr/settings/proxy'
|
2
|
+
require 'spidr/settings/timeouts'
|
1
3
|
require 'spidr/spidr'
|
2
4
|
|
3
5
|
require 'net/http'
|
6
|
+
require 'openssl'
|
4
7
|
|
5
8
|
module Spidr
|
6
9
|
#
|
@@ -8,31 +11,44 @@ module Spidr
|
|
8
11
|
#
|
9
12
|
class SessionCache
|
10
13
|
|
11
|
-
|
12
|
-
|
14
|
+
include Settings::Proxy
|
15
|
+
include Settings::Timeouts
|
13
16
|
|
14
17
|
#
|
15
18
|
# Creates a new session cache.
|
16
19
|
#
|
17
|
-
# @param [Hash]
|
20
|
+
# @param [Hash] options
|
21
|
+
# Configuration options.
|
22
|
+
#
|
23
|
+
# @option [Hash] :proxy (Spidr.proxy)
|
18
24
|
# Proxy options.
|
19
25
|
#
|
20
|
-
# @option
|
21
|
-
#
|
26
|
+
# @option [Integer] :open_timeout (Spidr.open_timeout)
|
27
|
+
# Optional open timeout.
|
22
28
|
#
|
23
|
-
# @option
|
24
|
-
#
|
29
|
+
# @option [Integer] :ssl_timeout (Spidr.ssl_timeout)
|
30
|
+
# Optional ssl timeout.
|
25
31
|
#
|
26
|
-
# @option
|
27
|
-
#
|
32
|
+
# @option [Integer] :read_timeout (Spidr.read_timeout)
|
33
|
+
# Optional read timeout.
|
28
34
|
#
|
29
|
-
# @option
|
30
|
-
#
|
35
|
+
# @option [Integer] :continue_timeout (Spidr.continue_timeout)
|
36
|
+
# Optional `Continue` timeout.
|
31
37
|
#
|
32
|
-
# @
|
38
|
+
# @option [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
|
39
|
+
# Optional `Keep-Alive` timeout.
|
33
40
|
#
|
34
|
-
|
35
|
-
|
41
|
+
# @since 0.6.0
|
42
|
+
#
|
43
|
+
def initialize(options={})
|
44
|
+
@proxy = options.fetch(:proxy,Spidr.proxy)
|
45
|
+
|
46
|
+
@open_timeout = options.fetch(:open_timeout,Spidr.open_timeout)
|
47
|
+
@ssl_timeout = options.fetch(:ssl_timeout,Spidr.ssl_timeout)
|
48
|
+
@read_timeout = options.fetch(:read_timeout,Spidr.read_timeout)
|
49
|
+
@continue_timeout = options.fetch(:continue_timeout,Spidr.continue_timeout)
|
50
|
+
@keep_alive_timeout = options.fetch(:keep_alive_timeout,Spidr.keep_alive_timeout)
|
51
|
+
|
36
52
|
@sessions = {}
|
37
53
|
end
|
38
54
|
|
@@ -52,7 +68,7 @@ module Spidr
|
|
52
68
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
53
69
|
|
54
70
|
# session key
|
55
|
-
key =
|
71
|
+
key = key_for(url)
|
56
72
|
|
57
73
|
return @sessions.has_key?(key)
|
58
74
|
end
|
@@ -71,19 +87,25 @@ module Spidr
|
|
71
87
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
72
88
|
|
73
89
|
# session key
|
74
|
-
key =
|
90
|
+
key = key_for(url)
|
75
91
|
|
76
92
|
unless @sessions[key]
|
77
93
|
session = Net::HTTP::Proxy(
|
78
|
-
@proxy
|
79
|
-
@proxy
|
80
|
-
@proxy
|
81
|
-
@proxy
|
94
|
+
@proxy.host,
|
95
|
+
@proxy.port,
|
96
|
+
@proxy.user,
|
97
|
+
@proxy.password
|
82
98
|
).new(url.host,url.port)
|
83
99
|
|
100
|
+
session.open_timeout = @open_timeout if @open_timeout
|
101
|
+
session.read_timeout = @read_timeout if @read_timeout
|
102
|
+
session.continue_timeout = @continue_timeout if @continue_timeout
|
103
|
+
session.keep_alive_timeout = @keep_alive_timeout if @keep_alive_timeout
|
104
|
+
|
84
105
|
if url.scheme == 'https'
|
85
106
|
session.use_ssl = true
|
86
107
|
session.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
108
|
+
session.ssl_timeout = @ssl_timeout
|
87
109
|
session.start
|
88
110
|
end
|
89
111
|
|
@@ -108,7 +130,7 @@ module Spidr
|
|
108
130
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
109
131
|
|
110
132
|
# session key
|
111
|
-
key =
|
133
|
+
key = key_for(url)
|
112
134
|
|
113
135
|
if (sess = @sessions[key])
|
114
136
|
begin
|
@@ -129,11 +151,10 @@ module Spidr
|
|
129
151
|
# @since 0.2.2
|
130
152
|
#
|
131
153
|
def clear
|
132
|
-
@sessions.each_value do |
|
154
|
+
@sessions.each_value do |session|
|
133
155
|
begin
|
134
|
-
|
156
|
+
session.finish
|
135
157
|
rescue IOError
|
136
|
-
nil
|
137
158
|
end
|
138
159
|
end
|
139
160
|
|
@@ -141,5 +162,20 @@ module Spidr
|
|
141
162
|
return self
|
142
163
|
end
|
143
164
|
|
165
|
+
private
|
166
|
+
|
167
|
+
#
|
168
|
+
# Creates a session key based on the URL.
|
169
|
+
#
|
170
|
+
# @param [URI::HTTP] url
|
171
|
+
# The given URL.
|
172
|
+
#
|
173
|
+
# @return [Array]
|
174
|
+
# The session key containing the scheme, host and port.
|
175
|
+
#
|
176
|
+
def key_for(url)
  # The key is the URL's scheme, host and port, in that order.
  %i[scheme host port].map { |component| url.public_send(component) }
end
|
179
|
+
|
144
180
|
end
|
145
181
|
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'spidr/proxy'
|
2
|
+
|
3
|
+
module Spidr
  module Settings
    #
    # Methods for configuring a proxy.
    #
    # @since 0.6.0
    #
    module Proxy
      #
      # The proxy information held by the receiver. A blank
      # `Spidr::Proxy` is created and memoized on first access.
      #
      # @return [Spidr::Proxy]
      #   The Spidr proxy information.
      #
      def proxy
        @proxy ||= Spidr::Proxy.new
      end

      #
      # Sets the proxy information.
      #
      # @param [Spidr::Proxy, Hash, nil] new_proxy
      #   The new proxy information. A Hash is converted into a
      #   `Spidr::Proxy`; `nil` resets to a blank `Spidr::Proxy`.
      #
      # @option new_proxy [String] :host
      #   The host-name of the proxy.
      #
      # @option new_proxy [Integer] :port (Spidr::Proxy::DEFAULT_PORT)
      #   The port of the proxy.
      #
      # @option new_proxy [String] :user
      #   The user to authenticate with the proxy as.
      #
      # @option new_proxy [String] :password
      #   The password to authenticate with the proxy.
      #
      # @return [Spidr::Proxy]
      #   The new proxy information.
      #
      # @raise [TypeError]
      #   The given value was not a `Spidr::Proxy`, a Hash or `nil`.
      #
      def proxy=(new_proxy)
        @proxy = case new_proxy
                 when Spidr::Proxy then new_proxy
                 when Hash         then Spidr::Proxy.new(new_proxy)
                 when nil          then Spidr::Proxy.new
                 else
                   # "Class#method" form; the separator was previously missing.
                   raise(TypeError,"#{self.class}##{__method__} only accepts Proxy, Hash or nil")
                 end
      end

      #
      # Disables the proxy settings by replacing them with a blank
      # `Spidr::Proxy`.
      #
      # @return [true]
      #
      def disable_proxy!
        @proxy = Spidr::Proxy.new
        return true
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Spidr
  module Settings
    #
    # Accessors for the individual timeout settings.
    #
    # @since 0.6.0
    #
    module Timeouts
      # The open timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :open_timeout

      # The SSL timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :ssl_timeout

      # The read timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :read_timeout

      # The `Continue` timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :continue_timeout

      # The `Keep-Alive` timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :keep_alive_timeout
    end
  end
end
|
data/lib/spidr/spidr.rb
CHANGED
@@ -1,79 +1,12 @@
|
|
1
|
+
require 'spidr/settings/proxy'
|
2
|
+
require 'spidr/settings/timeouts'
|
3
|
+
require 'spidr/settings/user_agent'
|
1
4
|
require 'spidr/agent'
|
2
5
|
|
3
6
|
module Spidr
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# Default proxy information.
|
8
|
-
DEFAULT_PROXY = {
|
9
|
-
host: nil,
|
10
|
-
port: COMMON_PROXY_PORT,
|
11
|
-
user: nil,
|
12
|
-
password: nil
|
13
|
-
}
|
14
|
-
|
15
|
-
#
|
16
|
-
# Proxy information used by all newly created Agent objects by default.
|
17
|
-
#
|
18
|
-
# @return [Hash]
|
19
|
-
# The Spidr proxy information.
|
20
|
-
#
|
21
|
-
def Spidr.proxy
|
22
|
-
@@spidr_proxy ||= DEFAULT_PROXY
|
23
|
-
end
|
24
|
-
|
25
|
-
#
|
26
|
-
# Sets the proxy information used by Agent objects.
|
27
|
-
#
|
28
|
-
# @param [Hash] new_proxy
|
29
|
-
# The new proxy information.
|
30
|
-
#
|
31
|
-
# @option new_proxy [String] :host
|
32
|
-
# The host-name of the proxy.
|
33
|
-
#
|
34
|
-
# @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
|
35
|
-
# The port of the proxy.
|
36
|
-
#
|
37
|
-
# @option new_proxy [String] :user
|
38
|
-
# The user to authenticate with the proxy as.
|
39
|
-
#
|
40
|
-
# @option new_proxy [String] :password
|
41
|
-
# The password to authenticate with the proxy.
|
42
|
-
#
|
43
|
-
# @return [Hash]
|
44
|
-
# The new proxy information.
|
45
|
-
#
|
46
|
-
def Spidr.proxy=(new_proxy)
|
47
|
-
@@spidr_proxy = {port: COMMON_PROXY_PORT}.merge(new_proxy)
|
48
|
-
end
|
49
|
-
|
50
|
-
#
|
51
|
-
# Disables the proxy settings used by all newly created Agent objects.
|
52
|
-
#
|
53
|
-
def Spidr.disable_proxy!
|
54
|
-
@@spidr_proxy = DEFAULT_PROXY
|
55
|
-
return true
|
56
|
-
end
|
57
|
-
|
58
|
-
#
|
59
|
-
# The User-Agent string used by all Agent objects by default.
|
60
|
-
#
|
61
|
-
# @return [String]
|
62
|
-
# The Spidr User-Agent string.
|
63
|
-
#
|
64
|
-
def Spidr.user_agent
|
65
|
-
@@spidr_user_agent ||= nil
|
66
|
-
end
|
67
|
-
|
68
|
-
#
|
69
|
-
# Sets the Spidr User-Agent string.
|
70
|
-
#
|
71
|
-
# @param [String] new_agent
|
72
|
-
# The new User-Agent string.
|
73
|
-
#
|
74
|
-
def Spidr.user_agent=(new_agent)
|
75
|
-
@@spidr_user_agent = new_agent
|
76
|
-
end
|
7
|
+
extend Settings::Proxy
|
8
|
+
extend Settings::Timeouts
|
9
|
+
extend Settings::UserAgent
|
77
10
|
|
78
11
|
#
|
79
12
|
# Specifies whether `robots.txt` should be honored globally.
|
@@ -82,7 +15,7 @@ module Spidr
|
|
82
15
|
#
|
83
16
|
# @since 0.5.0
|
84
17
|
#
|
85
|
-
def
|
18
|
+
def self.robots?
|
86
19
|
@robots
|
87
20
|
end
|
88
21
|
|
@@ -95,31 +28,34 @@ module Spidr
|
|
95
28
|
#
|
96
29
|
# @since 0.5.0
|
97
30
|
#
|
98
|
-
def
|
31
|
+
def self.robots=(mode)
|
99
32
|
@robots = mode
|
100
33
|
end
|
101
34
|
|
102
35
|
#
|
103
36
|
# @see Agent.start_at
|
104
37
|
#
|
105
|
-
def
|
38
|
+
def self.start_at(url,options={},&block)
|
106
39
|
Agent.start_at(url,options,&block)
|
107
40
|
end
|
108
41
|
|
109
42
|
#
|
110
43
|
# @see Agent.host
|
111
44
|
#
|
112
|
-
def
|
45
|
+
def self.host(name,options={},&block)
|
113
46
|
Agent.host(name,options,&block)
|
114
47
|
end
|
115
48
|
|
116
49
|
#
|
117
50
|
# @see Agent.site
|
118
51
|
#
|
119
|
-
def
|
52
|
+
def self.site(url,options={},&block)
|
120
53
|
Agent.site(url,options,&block)
|
121
54
|
end
|
122
55
|
|
123
|
-
|
56
|
+
#
|
57
|
+
# @abstract
|
58
|
+
#
|
59
|
+
def self.robots
|
124
60
|
end
|
125
61
|
end
|