spidr 0.6.1 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +19 -1
- data/Gemfile +7 -4
- data/LICENSE.txt +1 -1
- data/README.md +136 -79
- data/Rakefile +1 -0
- data/gemspec.yml +7 -0
- data/lib/spidr/agent/actions.rb +3 -1
- data/lib/spidr/agent/events.rb +3 -1
- data/lib/spidr/agent/filters.rb +57 -56
- data/lib/spidr/agent/robots.rb +2 -0
- data/lib/spidr/agent/sanitizers.rb +7 -8
- data/lib/spidr/agent.rb +232 -108
- data/lib/spidr/auth_credential.rb +2 -0
- data/lib/spidr/auth_store.rb +9 -7
- data/lib/spidr/cookie_jar.rb +7 -5
- data/lib/spidr/extensions/uri.rb +3 -1
- data/lib/spidr/extensions.rb +3 -1
- data/lib/spidr/page/content_types.rb +53 -0
- data/lib/spidr/page/cookies.rb +2 -0
- data/lib/spidr/page/html.rb +21 -20
- data/lib/spidr/page/status_codes.rb +15 -11
- data/lib/spidr/page.rb +3 -1
- data/lib/spidr/proxy.rb +8 -14
- data/lib/spidr/rules.rb +7 -8
- data/lib/spidr/session_cache.rb +26 -22
- data/lib/spidr/settings/proxy.rb +22 -6
- data/lib/spidr/settings/timeouts.rb +2 -0
- data/lib/spidr/settings/user_agent.rb +2 -0
- data/lib/spidr/settings.rb +5 -3
- data/lib/spidr/spidr.rb +22 -11
- data/lib/spidr/version.rb +3 -1
- data/lib/spidr.rb +5 -3
- data/spec/agent_spec.rb +356 -7
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- data/spidr.gemspec +1 -4
- metadata +8 -7
- data/.travis.yml +0 -16
data/lib/spidr/agent.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'settings/user_agent'
|
4
|
+
require_relative 'agent/sanitizers'
|
5
|
+
require_relative 'agent/filters'
|
6
|
+
require_relative 'agent/events'
|
7
|
+
require_relative 'agent/actions'
|
8
|
+
require_relative 'agent/robots'
|
9
|
+
require_relative 'page'
|
10
|
+
require_relative 'session_cache'
|
11
|
+
require_relative 'cookie_jar'
|
12
|
+
require_relative 'auth_store'
|
13
|
+
require_relative 'spidr'
|
12
14
|
|
13
15
|
require 'openssl'
|
14
16
|
require 'net/http'
|
@@ -19,12 +21,12 @@ module Spidr
|
|
19
21
|
|
20
22
|
include Settings::UserAgent
|
21
23
|
|
22
|
-
# HTTP Host Header to use
|
24
|
+
# HTTP `Host` Header to use
|
23
25
|
#
|
24
26
|
# @return [String]
|
25
27
|
attr_accessor :host_header
|
26
28
|
|
27
|
-
# HTTP Host Headers to use for specific hosts
|
29
|
+
# HTTP `Host` Headers to use for specific hosts
|
28
30
|
#
|
29
31
|
# @return [Hash{String,Regexp => String}]
|
30
32
|
attr_reader :host_headers
|
@@ -96,70 +98,110 @@ module Spidr
|
|
96
98
|
#
|
97
99
|
# Creates a new Agent object.
|
98
100
|
#
|
99
|
-
# @param [
|
100
|
-
#
|
101
|
+
# @param [String, nil] host_header
|
102
|
+
# The HTTP `Host` header to use with each request.
|
101
103
|
#
|
102
|
-
# @
|
103
|
-
#
|
104
|
+
# @param [Hash{String,Regexp => String}] host_headers
|
105
|
+
# The HTTP `Host` headers to use for specific hosts.
|
104
106
|
#
|
105
|
-
# @
|
107
|
+
# @param [Hash{String => String}] default_headers
|
108
|
+
# Default headers to set for every request.
|
109
|
+
#
|
110
|
+
# @param [String, nil] user_agent
|
111
|
+
# The `User-Agent` string to send with each request.
|
112
|
+
#
|
113
|
+
# @param [String, nil] referer
|
114
|
+
# The `Referer` URL to send with each request.
|
115
|
+
#
|
116
|
+
# @param [Integer, nil] open_timeout
|
117
|
+
# Optional open connection timeout.
|
118
|
+
#
|
119
|
+
# @param [Integer, nil] read_timeout
|
106
120
|
# Optional read timeout.
|
107
121
|
#
|
108
|
-
# @
|
109
|
-
# Optional
|
122
|
+
# @param [Integer, nil] ssl_timeout
|
123
|
+
# Optional SSL connection timeout.
|
110
124
|
#
|
111
|
-
# @
|
125
|
+
# @param [Integer, nil] continue_timeout
|
112
126
|
# Optional continue timeout.
|
113
127
|
#
|
114
|
-
# @
|
115
|
-
# Optional
|
128
|
+
# @param [Integer, nil] keep_alive_timeout
|
129
|
+
# Optional `Keep-Alive` timeout.
|
116
130
|
#
|
117
|
-
# @
|
131
|
+
# @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
|
118
132
|
# The proxy information to use.
|
119
133
|
#
|
120
|
-
# @option
|
134
|
+
# @option proxy [String] :host
|
121
135
|
# The host the proxy is running on.
|
122
136
|
#
|
123
|
-
# @option
|
137
|
+
# @option proxy [Integer] :port (8080)
|
124
138
|
# The port the proxy is running on.
|
125
139
|
#
|
126
|
-
# @option
|
140
|
+
# @option proxy [String, nil] :user
|
127
141
|
# The user to authenticate as with the proxy.
|
128
142
|
#
|
129
|
-
# @option
|
143
|
+
# @option proxy [String, nil] :password
|
130
144
|
# The password to authenticate with.
|
131
145
|
#
|
132
|
-
# @
|
133
|
-
#
|
146
|
+
# @param [Integer] delay
|
147
|
+
# The number of seconds to pause between each request.
|
134
148
|
#
|
135
|
-
# @
|
136
|
-
# The
|
149
|
+
# @param [Integer, nil] limit
|
150
|
+
# The maximum number of pages to visit.
|
137
151
|
#
|
138
|
-
# @
|
139
|
-
# The
|
152
|
+
# @param [Integer, nil] max_depth
|
153
|
+
# The maximum link depth to follow.
|
140
154
|
#
|
141
|
-
# @
|
142
|
-
# The
|
155
|
+
# @param [Set, Array, nil] queue
|
156
|
+
# The initial queue of URLs to visit.
|
143
157
|
#
|
144
|
-
# @
|
145
|
-
# The
|
158
|
+
# @param [Set, Array, nil] history
|
159
|
+
# The initial list of visited URLs.
|
146
160
|
#
|
147
|
-
# @
|
148
|
-
#
|
161
|
+
# @param [Boolean] strip_fragments
|
162
|
+
# Controls whether to strip the fragment components from the URLs.
|
149
163
|
#
|
150
|
-
# @
|
151
|
-
#
|
164
|
+
# @param [Boolean] strip_query
|
165
|
+
# Controls whether to strip the query components from the URLs.
|
152
166
|
#
|
153
|
-
# @
|
154
|
-
# The
|
167
|
+
# @param [Array<String>] schemes
|
168
|
+
# The list of acceptable URI schemes to visit.
|
169
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
155
170
|
#
|
156
|
-
# @
|
157
|
-
# The
|
171
|
+
# @param [String] host
|
172
|
+
# The host-name to visit.
|
158
173
|
#
|
159
|
-
# @
|
160
|
-
# The
|
174
|
+
# @param [Array<String, Regexp, Proc>] hosts
|
175
|
+
# The patterns which match the host-names to visit.
|
161
176
|
#
|
162
|
-
# @
|
177
|
+
# @param [Array<String, Regexp, Proc>] ignore_hosts
|
178
|
+
# The patterns which match the host-names to not visit.
|
179
|
+
#
|
180
|
+
# @param [Array<Integer, Regexp, Proc>] ports
|
181
|
+
# The patterns which match the ports to visit.
|
182
|
+
#
|
183
|
+
# @param [Array<Integer, Regexp, Proc>] ignore_ports
|
184
|
+
# The patterns which match the ports to not visit.
|
185
|
+
#
|
186
|
+
# @param [Array<String, Regexp, Proc>] links
|
187
|
+
# The patterns which match the links to visit.
|
188
|
+
#
|
189
|
+
# @param [Array<String, Regexp, Proc>] ignore_links
|
190
|
+
# The patterns which match the links to not visit.
|
191
|
+
#
|
192
|
+
# @param [Array<String, Regexp, Proc>] urls
|
193
|
+
# The patterns which match the URLs to visit.
|
194
|
+
#
|
195
|
+
# @param [Array<String, Regexp, Proc>] ignore_urls
|
196
|
+
# The patterns which match the URLs to not visit.
|
197
|
+
#
|
198
|
+
# @param [Array<String, Regexp, Proc>] exts
|
199
|
+
# The patterns which match the URI path extensions to visit.
|
200
|
+
#
|
201
|
+
# @param [Array<String, Regexp, Proc>] ignore_exts
|
202
|
+
# The patterns which match the URI path extensions to not visit.
|
203
|
+
#
|
204
|
+
# @param [Boolean] robots
|
163
205
|
# Specifies whether `robots.txt` should be honored.
|
164
206
|
#
|
165
207
|
# @yield [agent]
|
@@ -169,58 +211,99 @@ module Spidr
|
|
169
211
|
# @yieldparam [Agent] agent
|
170
212
|
# The newly created agent.
|
171
213
|
#
|
172
|
-
#
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
214
|
+
def initialize(# header keyword arguments
|
215
|
+
host_header: nil,
|
216
|
+
host_headers: {},
|
217
|
+
default_headers: {},
|
218
|
+
user_agent: Spidr.user_agent,
|
219
|
+
referer: nil,
|
220
|
+
# session cache keyword arguments
|
221
|
+
proxy: Spidr.proxy,
|
222
|
+
open_timeout: Spidr.open_timeout,
|
223
|
+
ssl_timeout: Spidr.ssl_timeout,
|
224
|
+
read_timeout: Spidr.read_timeout,
|
225
|
+
continue_timeout: Spidr.continue_timeout,
|
226
|
+
keep_alive_timeout: Spidr.keep_alive_timeout,
|
227
|
+
# spidering controls keyword arguments
|
228
|
+
delay: 0,
|
229
|
+
limit: nil,
|
230
|
+
max_depth: nil,
|
231
|
+
# history keyword arguments
|
232
|
+
queue: nil,
|
233
|
+
history: nil,
|
234
|
+
# sanitizer keyword arguments
|
235
|
+
strip_fragments: true,
|
236
|
+
strip_query: false,
|
237
|
+
# filtering keyword arguments
|
238
|
+
schemes: self.class.default_schemes,
|
239
|
+
host: nil,
|
240
|
+
hosts: nil,
|
241
|
+
ignore_hosts: nil,
|
242
|
+
ports: nil,
|
243
|
+
ignore_ports: nil,
|
244
|
+
links: nil,
|
245
|
+
ignore_links: nil,
|
246
|
+
urls: nil,
|
247
|
+
ignore_urls: nil,
|
248
|
+
exts: nil,
|
249
|
+
ignore_exts: nil,
|
250
|
+
# robots keyword arguments
|
251
|
+
robots: Spidr.robots?)
|
252
|
+
@host_header = host_header
|
253
|
+
@host_headers = host_headers
|
254
|
+
|
255
|
+
@default_headers = default_headers
|
256
|
+
|
257
|
+
@user_agent = user_agent
|
258
|
+
@referer = referer
|
259
|
+
|
260
|
+
@sessions = SessionCache.new(
|
261
|
+
proxy: proxy,
|
262
|
+
open_timeout: open_timeout,
|
263
|
+
ssl_timeout: ssl_timeout,
|
264
|
+
read_timeout: read_timeout,
|
265
|
+
continue_timeout: continue_timeout,
|
266
|
+
keep_alive_timeout: keep_alive_timeout
|
267
|
+
)
|
195
268
|
@cookies = CookieJar.new
|
196
269
|
@authorized = AuthStore.new
|
197
270
|
|
198
271
|
@running = false
|
199
|
-
@delay =
|
272
|
+
@delay = delay
|
200
273
|
@history = Set[]
|
201
274
|
@failures = Set[]
|
202
275
|
@queue = []
|
203
276
|
|
204
|
-
@limit =
|
277
|
+
@limit = limit
|
205
278
|
@levels = Hash.new(0)
|
206
|
-
@max_depth =
|
207
|
-
|
208
|
-
if
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
279
|
+
@max_depth = max_depth
|
280
|
+
|
281
|
+
self.queue = queue if queue
|
282
|
+
self.history = history if history
|
283
|
+
|
284
|
+
initialize_sanitizers(
|
285
|
+
strip_fragments: strip_fragments,
|
286
|
+
strip_query: strip_query
|
287
|
+
)
|
288
|
+
|
289
|
+
initialize_filters(
|
290
|
+
schemes: schemes,
|
291
|
+
host: host,
|
292
|
+
hosts: hosts,
|
293
|
+
ignore_hosts: ignore_hosts,
|
294
|
+
ports: ports,
|
295
|
+
ignore_ports: ignore_ports,
|
296
|
+
links: links,
|
297
|
+
ignore_links: ignore_links,
|
298
|
+
urls: urls,
|
299
|
+
ignore_urls: ignore_urls,
|
300
|
+
exts: exts,
|
301
|
+
ignore_exts: ignore_exts
|
302
|
+
)
|
303
|
+
initialize_actions
|
304
|
+
initialize_events
|
305
|
+
|
306
|
+
initialize_robots if robots
|
224
307
|
|
225
308
|
yield self if block_given?
|
226
309
|
end
|
@@ -231,8 +314,8 @@ module Spidr
|
|
231
314
|
# @param [URI::HTTP, String] url
|
232
315
|
# The URL to start spidering at.
|
233
316
|
#
|
234
|
-
# @param [Hash]
|
235
|
-
# Additional
|
317
|
+
# @param [Hash{Symbol => Object}] kwargs
|
318
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
236
319
|
#
|
237
320
|
# @yield [agent]
|
238
321
|
# If a block is given, it will be passed the newly created agent
|
@@ -241,12 +324,16 @@ module Spidr
|
|
241
324
|
# @yieldparam [Agent] agent
|
242
325
|
# The newly created agent.
|
243
326
|
#
|
327
|
+
# @return [Agent]
|
328
|
+
# The created agent object.
|
329
|
+
#
|
244
330
|
# @see #initialize
|
245
331
|
# @see #start_at
|
246
332
|
#
|
247
|
-
def self.start_at(url
|
248
|
-
agent = new(
|
333
|
+
def self.start_at(url,**kwargs,&block)
|
334
|
+
agent = new(**kwargs,&block)
|
249
335
|
agent.start_at(url)
|
336
|
+
return agent
|
250
337
|
end
|
251
338
|
|
252
339
|
#
|
@@ -255,8 +342,8 @@ module Spidr
|
|
255
342
|
# @param [URI::HTTP, String] url
|
256
343
|
# The web-site to spider.
|
257
344
|
#
|
258
|
-
# @param [Hash]
|
259
|
-
# Additional
|
345
|
+
# @param [Hash{Symbol => Object}] kwargs
|
346
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
260
347
|
#
|
261
348
|
# @yield [agent]
|
262
349
|
# If a block is given, it will be passed the newly created agent
|
@@ -265,13 +352,17 @@ module Spidr
|
|
265
352
|
# @yieldparam [Agent] agent
|
266
353
|
# The newly created agent.
|
267
354
|
#
|
355
|
+
# @return [Agent]
|
356
|
+
# The created agent object.
|
357
|
+
#
|
268
358
|
# @see #initialize
|
269
359
|
#
|
270
|
-
def self.site(url
|
360
|
+
def self.site(url,**kwargs,&block)
|
271
361
|
url = URI(url)
|
272
362
|
|
273
|
-
agent = new(
|
363
|
+
agent = new(host: url.host, **kwargs, &block)
|
274
364
|
agent.start_at(url)
|
365
|
+
return agent
|
275
366
|
end
|
276
367
|
|
277
368
|
#
|
@@ -280,8 +371,35 @@ module Spidr
|
|
280
371
|
# @param [String] name
|
281
372
|
# The host-name to spider.
|
282
373
|
#
|
283
|
-
# @param [Hash]
|
284
|
-
# Additional
|
374
|
+
# @param [Hash{Symbol => Object}] kwargs
|
375
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
376
|
+
#
|
377
|
+
# @yield [agent]
|
378
|
+
# If a block is given, it will be passed the newly created agent
|
379
|
+
# before it begins spidering.
|
380
|
+
#
|
381
|
+
# @yieldparam [Agent] agent
|
382
|
+
# The newly created agent.
|
383
|
+
#
|
384
|
+
# @return [Agent]
|
385
|
+
# The created agent object.
|
386
|
+
#
|
387
|
+
# @see #initialize
|
388
|
+
#
|
389
|
+
def self.host(name,**kwargs,&block)
|
390
|
+
agent = new(host: name, **kwargs, &block)
|
391
|
+
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
392
|
+
return agent
|
393
|
+
end
|
394
|
+
|
395
|
+
#
|
396
|
+
# Creates a new agent and spiders the entire domain.
|
397
|
+
#
|
398
|
+
# @param [String] name
|
399
|
+
# The top-level domain to spider.
|
400
|
+
#
|
401
|
+
# @param [Hash{Symbol => Object}] kwargs
|
402
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
285
403
|
#
|
286
404
|
# @yield [agent]
|
287
405
|
# If a block is given, it will be passed the newly created agent
|
@@ -290,11 +408,17 @@ module Spidr
|
|
290
408
|
# @yieldparam [Agent] agent
|
291
409
|
# The newly created agent.
|
292
410
|
#
|
411
|
+
# @return [Agent]
|
412
|
+
# The created agent object.
|
413
|
+
#
|
293
414
|
# @see #initialize
|
294
415
|
#
|
295
|
-
|
296
|
-
|
416
|
+
# @since 0.7.0
|
417
|
+
#
|
418
|
+
def self.domain(name,**kwargs,&block)
|
419
|
+
agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
|
297
420
|
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
421
|
+
return agent
|
298
422
|
end
|
299
423
|
|
300
424
|
#
|
@@ -314,10 +438,10 @@ module Spidr
|
|
314
438
|
#
|
315
439
|
# Sets the proxy information that the agent uses.
|
316
440
|
#
|
317
|
-
# @param [Proxy] new_proxy
|
441
|
+
# @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
|
318
442
|
# The new proxy information.
|
319
443
|
#
|
320
|
-
# @return [
|
444
|
+
# @return [Proxy]
|
321
445
|
# The new proxy information.
|
322
446
|
#
|
323
447
|
# @see SessionCache#proxy=
|
@@ -534,7 +658,7 @@ module Spidr
|
|
534
658
|
def enqueue(url,level=0)
|
535
659
|
url = sanitize_url(url)
|
536
660
|
|
537
|
-
if (!
|
661
|
+
if (!queued?(url) && visit?(url))
|
538
662
|
link = url.to_s
|
539
663
|
|
540
664
|
begin
|
@@ -633,7 +757,7 @@ module Spidr
|
|
633
757
|
end
|
634
758
|
|
635
759
|
#
|
636
|
-
# Visits a given URL, and
|
760
|
+
# Visits a given URL, and enqueues the links recovered from the URL
|
637
761
|
# to be visited later.
|
638
762
|
#
|
639
763
|
# @param [URI::HTTP, String] url
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'extensions/uri'
|
4
|
+
require_relative 'auth_credential'
|
5
|
+
require_relative 'page'
|
4
6
|
|
5
7
|
require 'base64'
|
6
8
|
|
@@ -20,7 +22,7 @@ module Spidr
|
|
20
22
|
@credentials = {}
|
21
23
|
end
|
22
24
|
|
23
|
-
#
|
25
|
+
#
|
24
26
|
# Given a URL, return the most specific matching auth credential.
|
25
27
|
#
|
26
28
|
# @param [URI] url
|
@@ -54,7 +56,7 @@ module Spidr
|
|
54
56
|
return nil
|
55
57
|
end
|
56
58
|
|
57
|
-
#
|
59
|
+
#
|
58
60
|
# Add an auth credential to the store for supplied base URL.
|
59
61
|
#
|
60
62
|
# @param [URI] url
|
@@ -109,7 +111,7 @@ module Spidr
|
|
109
111
|
# or `nil` if no authorization exists.
|
110
112
|
#
|
111
113
|
# @param [URI] url
|
112
|
-
# The
|
114
|
+
# The URL.
|
113
115
|
#
|
114
116
|
# @return [String, nil]
|
115
117
|
# The base64 encoded authorization string or `nil`.
|
@@ -122,7 +124,7 @@ module Spidr
|
|
122
124
|
end
|
123
125
|
end
|
124
126
|
|
125
|
-
#
|
127
|
+
#
|
126
128
|
# Clear the contents of the auth store.
|
127
129
|
#
|
128
130
|
# @return [AuthStore]
|
data/lib/spidr/cookie_jar.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'page'
|
2
4
|
|
3
5
|
require 'set'
|
4
6
|
|
@@ -42,8 +44,8 @@ module Spidr
|
|
42
44
|
@params.each(&block)
|
43
45
|
end
|
44
46
|
|
45
|
-
#
|
46
|
-
# Return all relevant cookies in a single string for the
|
47
|
+
#
|
48
|
+
# Return all relevant cookies in a single string for the
|
47
49
|
# named host or domain (in browser request format).
|
48
50
|
#
|
49
51
|
# @param [String] host
|
@@ -59,7 +61,7 @@ module Spidr
|
|
59
61
|
@params[host] ||= {}
|
60
62
|
end
|
61
63
|
|
62
|
-
#
|
64
|
+
#
|
63
65
|
# Add a cookie to the jar for a particular domain.
|
64
66
|
#
|
65
67
|
# @param [String] host
|
@@ -166,7 +168,7 @@ module Spidr
|
|
166
168
|
return host_cookies
|
167
169
|
end
|
168
170
|
|
169
|
-
#
|
171
|
+
#
|
170
172
|
# Clear out the jar, removing all stored cookies.
|
171
173
|
#
|
172
174
|
# @since 0.2.2
|
data/lib/spidr/extensions/uri.rb
CHANGED
data/lib/spidr/extensions.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Spidr
|
2
4
|
class Page
|
3
5
|
#
|
@@ -221,5 +223,56 @@ module Spidr
|
|
221
223
|
def zip?
|
222
224
|
is_content_type?('application/zip')
|
223
225
|
end
|
226
|
+
|
227
|
+
#
|
228
|
+
# Determines if the page is a PNG image.
|
229
|
+
#
|
230
|
+
# @return [Boolean]
|
231
|
+
# Specifies whether the page is a PNG image.
|
232
|
+
#
|
233
|
+
# @since 0.7.0
|
234
|
+
#
|
235
|
+
def png?
|
236
|
+
is_content_type?('image/png')
|
237
|
+
end
|
238
|
+
|
239
|
+
#
|
240
|
+
# Determines if the page is a GIF image.
|
241
|
+
#
|
242
|
+
# @return [Boolean]
|
243
|
+
# Specifies whether the page is a GIF image.
|
244
|
+
#
|
245
|
+
# @since 0.7.0
|
246
|
+
#
|
247
|
+
def gif?
|
248
|
+
is_content_type?('image/gif')
|
249
|
+
end
|
250
|
+
|
251
|
+
#
|
252
|
+
# Determines if the page is a JPEG image.
|
253
|
+
#
|
254
|
+
# @return [Boolean]
|
255
|
+
# Specifies whether the page is a JPEG image.
|
256
|
+
#
|
257
|
+
# @since 0.7.0
|
258
|
+
#
|
259
|
+
def jpeg?
|
260
|
+
is_content_type?('image/jpeg')
|
261
|
+
end
|
262
|
+
|
263
|
+
#
|
264
|
+
# Determines if the page is an ICO image.
|
265
|
+
#
|
266
|
+
# @return [Boolean]
|
267
|
+
# Specifies whether the page is an ICO image.
|
268
|
+
#
|
269
|
+
# @since 0.7.0
|
270
|
+
#
|
271
|
+
def ico?
|
272
|
+
is_content_type?('image/x-icon') ||
|
273
|
+
is_content_type?('image/vnd.microsoft.icon')
|
274
|
+
end
|
275
|
+
|
276
|
+
alias icon? ico?
|
224
277
|
end
|
225
278
|
end
|