spidr 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
@@ -0,0 +1,112 @@
|
|
1
|
+
module Spidr
  class Page
    #
    # The response code from the page.
    #
    # @return [Integer]
    #   The `code` of the underlying response, converted to an Integer.
    #
    def code
      @response.code.to_i
    end

    #
    # Determines if the response code is `200`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `200`.
    #
    def is_ok?
      code == 200
    end

    alias ok? is_ok?

    #
    # Determines if the response code is `408` (Request Timeout).
    #
    # @return [Boolean]
    #   Specifies whether the response code is `408`.
    #
    def timedout?
      # 408 is the HTTP "Request Timeout" status. 308 is "Permanent Redirect"
      # and must not be reported as a timeout.
      code == 408
    end

    #
    # Determines if the response code is `400`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `400`.
    #
    def bad_request?
      code == 400
    end

    #
    # Determines if the response code is `401`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `401`.
    #
    def is_unauthorized?
      code == 401
    end

    alias unauthorized? is_unauthorized?

    #
    # Determines if the response code is `403`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `403`.
    #
    def is_forbidden?
      code == 403
    end

    alias forbidden? is_forbidden?

    #
    # Determines if the response code is `404`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `404`.
    #
    def is_missing?
      code == 404
    end

    alias missing? is_missing?

    #
    # Determines if the response code is `500`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `500`.
    #
    def had_internal_server_error?
      code == 500
    end

    #
    # Determines if the response code is `300`, `301`, `302`, `303`
    # or `307`. Also checks for "soft" redirects added at the page
    # level by a meta refresh tag.
    #
    # @return [Boolean]
    #   Specifies whether the response code is a HTTP Redirect code, or,
    #   for a `200` response, whatever `meta_redirect?` reports.
    #
    def is_redirect?
      case code
      when 300..303, 307
        true
      when 200
        # A successful response may still redirect from within the page;
        # that check is delegated to `meta_redirect?` (defined outside
        # this file).
        meta_redirect?
      else
        false
      end
    end

    alias redirect? is_redirect?
  end
end
|
data/lib/spidr/proxy.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
module Spidr
  #
  # Holds the host, port, user and password of a proxy.
  #
  # @since 0.6.0
  #
  class Proxy < Struct.new(:host, :port, :user, :password)

    # Port used when the attributes do not contain a `:port` key.
    DEFAULT_PORT = 8080

    #
    # Builds the proxy from a Hash of attributes.
    #
    # @param [Hash] attributes
    #   The proxy attributes.
    #
    # @option attributes [String] :host
    #   Host the proxy runs on.
    #
    # @option attributes [Integer] :port (DEFAULT_PORT)
    #   Port the proxy listens on.
    #
    # @option attributes [String] :user
    #   User name to authenticate with.
    #
    # @option attributes [String] :password
    #   Password to authenticate with.
    #
    def initialize(attributes={})
      proxy_host     = attributes[:host]
      # Only a missing :port key falls back to the default; an explicit
      # value (including nil) is kept as given.
      proxy_port     = attributes.fetch(:port) { DEFAULT_PORT }
      proxy_user     = attributes[:user]
      proxy_password = attributes[:password]

      super(proxy_host, proxy_port, proxy_user, proxy_password)
    end

    #
    # Whether a proxy host has been configured.
    #
    # @return [Boolean]
    #   `true` when `host` is not `nil`.
    #
    def enabled?
      host.nil? ? false : true
    end

    #
    # Whether no proxy host has been configured.
    #
    # @return [Boolean]
    #   `true` when `host` is `nil`.
    #
    def disabled?
      host.nil?
    end

  end
end
|
data/lib/spidr/session_cache.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
require 'spidr/settings/proxy'
|
2
|
+
require 'spidr/settings/timeouts'
|
1
3
|
require 'spidr/spidr'
|
2
4
|
|
3
5
|
require 'net/http'
|
6
|
+
require 'openssl'
|
4
7
|
|
5
8
|
module Spidr
|
6
9
|
#
|
@@ -8,31 +11,44 @@ module Spidr
|
|
8
11
|
#
|
9
12
|
class SessionCache
|
10
13
|
|
11
|
-
|
12
|
-
|
14
|
+
include Settings::Proxy
|
15
|
+
include Settings::Timeouts
|
13
16
|
|
14
17
|
#
|
15
18
|
# Creates a new session cache.
|
16
19
|
#
|
17
|
-
# @param [Hash]
|
20
|
+
# @param [Hash] options
|
21
|
+
# Configuration options.
|
22
|
+
#
|
23
|
+
# @option options [Hash] :proxy (Spidr.proxy)
|
18
24
|
# Proxy options.
|
19
25
|
#
|
20
|
-
# @option
|
21
|
-
#
|
26
|
+
# @option options [Integer] :open_timeout (Spidr.open_timeout)
|
27
|
+
# Optional open timeout.
|
22
28
|
#
|
23
|
-
# @option
|
24
|
-
#
|
29
|
+
# @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
|
30
|
+
# Optional ssl timeout.
|
25
31
|
#
|
26
|
-
# @option
|
27
|
-
#
|
32
|
+
# @option options [Integer] :read_timeout (Spidr.read_timeout)
|
33
|
+
# Optional read timeout.
|
28
34
|
#
|
29
|
-
# @option
|
30
|
-
#
|
35
|
+
# @option options [Integer] :continue_timeout (Spidr.continue_timeout)
|
36
|
+
# Optional `Continue` timeout.
|
31
37
|
#
|
32
|
-
# @
|
38
|
+
# @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
|
39
|
+
# Optional `Keep-Alive` timeout.
|
33
40
|
#
|
34
|
-
|
35
|
-
|
41
|
+
# @since 0.6.0
|
42
|
+
#
|
43
|
+
def initialize(options={})
|
44
|
+
@proxy = options.fetch(:proxy,Spidr.proxy)
|
45
|
+
|
46
|
+
@open_timeout = options.fetch(:open_timeout,Spidr.open_timeout)
|
47
|
+
@ssl_timeout = options.fetch(:ssl_timeout,Spidr.ssl_timeout)
|
48
|
+
@read_timeout = options.fetch(:read_timeout,Spidr.read_timeout)
|
49
|
+
@continue_timeout = options.fetch(:continue_timeout,Spidr.continue_timeout)
|
50
|
+
@keep_alive_timeout = options.fetch(:keep_alive_timeout,Spidr.keep_alive_timeout)
|
51
|
+
|
36
52
|
@sessions = {}
|
37
53
|
end
|
38
54
|
|
@@ -52,7 +68,7 @@ module Spidr
|
|
52
68
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
53
69
|
|
54
70
|
# session key
|
55
|
-
key =
|
71
|
+
key = key_for(url)
|
56
72
|
|
57
73
|
return @sessions.has_key?(key)
|
58
74
|
end
|
@@ -71,19 +87,25 @@ module Spidr
|
|
71
87
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
72
88
|
|
73
89
|
# session key
|
74
|
-
key =
|
90
|
+
key = key_for(url)
|
75
91
|
|
76
92
|
unless @sessions[key]
|
77
93
|
session = Net::HTTP::Proxy(
|
78
|
-
@proxy
|
79
|
-
@proxy
|
80
|
-
@proxy
|
81
|
-
@proxy
|
94
|
+
@proxy.host,
|
95
|
+
@proxy.port,
|
96
|
+
@proxy.user,
|
97
|
+
@proxy.password
|
82
98
|
).new(url.host,url.port)
|
83
99
|
|
100
|
+
session.open_timeout = @open_timeout if @open_timeout
|
101
|
+
session.read_timeout = @read_timeout if @read_timeout
|
102
|
+
session.continue_timeout = @continue_timeout if @continue_timeout
|
103
|
+
session.keep_alive_timeout = @keep_alive_timeout if @keep_alive_timeout
|
104
|
+
|
84
105
|
if url.scheme == 'https'
|
85
106
|
session.use_ssl = true
|
86
107
|
session.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
108
|
+
session.ssl_timeout = @ssl_timeout
|
87
109
|
session.start
|
88
110
|
end
|
89
111
|
|
@@ -108,7 +130,7 @@ module Spidr
|
|
108
130
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
109
131
|
|
110
132
|
# session key
|
111
|
-
key =
|
133
|
+
key = key_for(url)
|
112
134
|
|
113
135
|
if (sess = @sessions[key])
|
114
136
|
begin
|
@@ -129,11 +151,10 @@ module Spidr
|
|
129
151
|
# @since 0.2.2
|
130
152
|
#
|
131
153
|
def clear
|
132
|
-
@sessions.each_value do |
|
154
|
+
@sessions.each_value do |session|
|
133
155
|
begin
|
134
|
-
|
156
|
+
session.finish
|
135
157
|
rescue IOError
|
136
|
-
nil
|
137
158
|
end
|
138
159
|
end
|
139
160
|
|
@@ -141,5 +162,20 @@ module Spidr
|
|
141
162
|
return self
|
142
163
|
end
|
143
164
|
|
165
|
+
private
|
166
|
+
|
167
|
+
#
|
168
|
+
# Creates a session key based on the URL.
|
169
|
+
#
|
170
|
+
# @param [URI::HTTP] url
|
171
|
+
# The given URL.
|
172
|
+
#
|
173
|
+
# @return [Array]
|
174
|
+
# The session key containing the scheme, host and port.
|
175
|
+
#
|
176
|
+
def key_for(url)
|
177
|
+
[url.scheme, url.host, url.port]
|
178
|
+
end
|
179
|
+
|
144
180
|
end
|
145
181
|
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'spidr/proxy'
|
2
|
+
|
3
|
+
module Spidr
  module Settings
    #
    # Methods for configuring a proxy.
    #
    # @since 0.6.0
    #
    module Proxy
      #
      # The proxy information of the receiver. Lazily initialized to an
      # empty {Spidr::Proxy} the first time it is read.
      #
      # @return [Spidr::Proxy]
      #   The Spidr proxy information.
      #
      def proxy
        @proxy ||= Spidr::Proxy.new
      end

      #
      # Sets the proxy information.
      #
      # @param [Spidr::Proxy, Hash, nil] new_proxy
      #   The new proxy information. A Hash is converted into a
      #   {Spidr::Proxy}; `nil` resets to an empty {Spidr::Proxy}.
      #
      # @option new_proxy [String] :host
      #   The host-name of the proxy.
      #
      # @option new_proxy [Integer] :port (Spidr::Proxy::DEFAULT_PORT)
      #   The port of the proxy.
      #
      # @option new_proxy [String] :user
      #   The user to authenticate with the proxy as.
      #
      # @option new_proxy [String] :password
      #   The password to authenticate with the proxy.
      #
      # @return [Spidr::Proxy]
      #   The new proxy information.
      #
      # @raise [TypeError]
      #   The given value was not a {Spidr::Proxy}, a Hash or `nil`.
      #
      def proxy=(new_proxy)
        @proxy = case new_proxy
                 when Spidr::Proxy then new_proxy
                 when Hash         then Spidr::Proxy.new(new_proxy)
                 when nil          then Spidr::Proxy.new
                 else
                   # "Class#method" form; the literal '#' separates the
                   # receiver's class from the method name.
                   raise(TypeError,"#{self.class}##{__method__} only accepts Proxy, Hash or nil")
                 end
      end

      #
      # Disables the proxy settings by replacing them with an empty
      # {Spidr::Proxy}.
      #
      # @return [true]
      #
      def disable_proxy!
        @proxy = Spidr::Proxy.new
        return true
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Spidr
  module Settings
    #
    # Accessors for the timeout settings. Every value is `nil` until
    # it is assigned.
    #
    # @since 0.6.0
    #
    module Timeouts
      # The read timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :read_timeout

      # The open timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :open_timeout

      # The SSL timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :ssl_timeout

      # The `Continue` timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :continue_timeout

      # The `Keep-Alive` timeout setting.
      #
      # @return [Integer, nil]
      attr_accessor :keep_alive_timeout
    end
  end
end
|
data/lib/spidr/spidr.rb
CHANGED
@@ -1,79 +1,12 @@
|
|
1
|
+
require 'spidr/settings/proxy'
|
2
|
+
require 'spidr/settings/timeouts'
|
3
|
+
require 'spidr/settings/user_agent'
|
1
4
|
require 'spidr/agent'
|
2
5
|
|
3
6
|
module Spidr
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# Default proxy information.
|
8
|
-
DEFAULT_PROXY = {
|
9
|
-
host: nil,
|
10
|
-
port: COMMON_PROXY_PORT,
|
11
|
-
user: nil,
|
12
|
-
password: nil
|
13
|
-
}
|
14
|
-
|
15
|
-
#
|
16
|
-
# Proxy information used by all newly created Agent objects by default.
|
17
|
-
#
|
18
|
-
# @return [Hash]
|
19
|
-
# The Spidr proxy information.
|
20
|
-
#
|
21
|
-
def Spidr.proxy
|
22
|
-
@@spidr_proxy ||= DEFAULT_PROXY
|
23
|
-
end
|
24
|
-
|
25
|
-
#
|
26
|
-
# Sets the proxy information used by Agent objects.
|
27
|
-
#
|
28
|
-
# @param [Hash] new_proxy
|
29
|
-
# The new proxy information.
|
30
|
-
#
|
31
|
-
# @option new_proxy [String] :host
|
32
|
-
# The host-name of the proxy.
|
33
|
-
#
|
34
|
-
# @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
|
35
|
-
# The port of the proxy.
|
36
|
-
#
|
37
|
-
# @option new_proxy [String] :user
|
38
|
-
# The user to authenticate with the proxy as.
|
39
|
-
#
|
40
|
-
# @option new_proxy [String] :password
|
41
|
-
# The password to authenticate with the proxy.
|
42
|
-
#
|
43
|
-
# @return [Hash]
|
44
|
-
# The new proxy information.
|
45
|
-
#
|
46
|
-
def Spidr.proxy=(new_proxy)
|
47
|
-
@@spidr_proxy = {port: COMMON_PROXY_PORT}.merge(new_proxy)
|
48
|
-
end
|
49
|
-
|
50
|
-
#
|
51
|
-
# Disables the proxy settings used by all newly created Agent objects.
|
52
|
-
#
|
53
|
-
def Spidr.disable_proxy!
|
54
|
-
@@spidr_proxy = DEFAULT_PROXY
|
55
|
-
return true
|
56
|
-
end
|
57
|
-
|
58
|
-
#
|
59
|
-
# The User-Agent string used by all Agent objects by default.
|
60
|
-
#
|
61
|
-
# @return [String]
|
62
|
-
# The Spidr User-Agent string.
|
63
|
-
#
|
64
|
-
def Spidr.user_agent
|
65
|
-
@@spidr_user_agent ||= nil
|
66
|
-
end
|
67
|
-
|
68
|
-
#
|
69
|
-
# Sets the Spidr User-Agent string.
|
70
|
-
#
|
71
|
-
# @param [String] new_agent
|
72
|
-
# The new User-Agent string.
|
73
|
-
#
|
74
|
-
def Spidr.user_agent=(new_agent)
|
75
|
-
@@spidr_user_agent = new_agent
|
76
|
-
end
|
7
|
+
extend Settings::Proxy
|
8
|
+
extend Settings::Timeouts
|
9
|
+
extend Settings::UserAgent
|
77
10
|
|
78
11
|
#
|
79
12
|
# Specifies whether `robots.txt` should be honored globally.
|
@@ -82,7 +15,7 @@ module Spidr
|
|
82
15
|
#
|
83
16
|
# @since 0.5.0
|
84
17
|
#
|
85
|
-
def
|
18
|
+
def self.robots?
|
86
19
|
@robots
|
87
20
|
end
|
88
21
|
|
@@ -95,31 +28,34 @@ module Spidr
|
|
95
28
|
#
|
96
29
|
# @since 0.5.0
|
97
30
|
#
|
98
|
-
def
|
31
|
+
def self.robots=(mode)
|
99
32
|
@robots = mode
|
100
33
|
end
|
101
34
|
|
102
35
|
#
|
103
36
|
# @see Agent.start_at
|
104
37
|
#
|
105
|
-
def
|
38
|
+
def self.start_at(url,options={},&block)
|
106
39
|
Agent.start_at(url,options,&block)
|
107
40
|
end
|
108
41
|
|
109
42
|
#
|
110
43
|
# @see Agent.host
|
111
44
|
#
|
112
|
-
def
|
45
|
+
def self.host(name,options={},&block)
|
113
46
|
Agent.host(name,options,&block)
|
114
47
|
end
|
115
48
|
|
116
49
|
#
|
117
50
|
# @see Agent.site
|
118
51
|
#
|
119
|
-
def
|
52
|
+
def self.site(url,options={},&block)
|
120
53
|
Agent.site(url,options,&block)
|
121
54
|
end
|
122
55
|
|
123
|
-
|
56
|
+
#
|
57
|
+
# @abstract
|
58
|
+
#
|
59
|
+
def self.robots
|
124
60
|
end
|
125
61
|
end
|