spidr 0.6.1 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +19 -1
- data/Gemfile +7 -4
- data/LICENSE.txt +1 -1
- data/README.md +136 -79
- data/Rakefile +1 -0
- data/gemspec.yml +7 -0
- data/lib/spidr/agent/actions.rb +3 -1
- data/lib/spidr/agent/events.rb +3 -1
- data/lib/spidr/agent/filters.rb +57 -56
- data/lib/spidr/agent/robots.rb +2 -0
- data/lib/spidr/agent/sanitizers.rb +7 -8
- data/lib/spidr/agent.rb +232 -108
- data/lib/spidr/auth_credential.rb +2 -0
- data/lib/spidr/auth_store.rb +9 -7
- data/lib/spidr/cookie_jar.rb +7 -5
- data/lib/spidr/extensions/uri.rb +3 -1
- data/lib/spidr/extensions.rb +3 -1
- data/lib/spidr/page/content_types.rb +53 -0
- data/lib/spidr/page/cookies.rb +2 -0
- data/lib/spidr/page/html.rb +21 -20
- data/lib/spidr/page/status_codes.rb +15 -11
- data/lib/spidr/page.rb +3 -1
- data/lib/spidr/proxy.rb +8 -14
- data/lib/spidr/rules.rb +7 -8
- data/lib/spidr/session_cache.rb +26 -22
- data/lib/spidr/settings/proxy.rb +22 -6
- data/lib/spidr/settings/timeouts.rb +2 -0
- data/lib/spidr/settings/user_agent.rb +2 -0
- data/lib/spidr/settings.rb +5 -3
- data/lib/spidr/spidr.rb +22 -11
- data/lib/spidr/version.rb +3 -1
- data/lib/spidr.rb +5 -3
- data/spec/agent_spec.rb +356 -7
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- data/spidr.gemspec +1 -4
- metadata +8 -7
- data/.travis.yml +0 -16
data/lib/spidr/agent.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'settings/user_agent'
|
4
|
+
require_relative 'agent/sanitizers'
|
5
|
+
require_relative 'agent/filters'
|
6
|
+
require_relative 'agent/events'
|
7
|
+
require_relative 'agent/actions'
|
8
|
+
require_relative 'agent/robots'
|
9
|
+
require_relative 'page'
|
10
|
+
require_relative 'session_cache'
|
11
|
+
require_relative 'cookie_jar'
|
12
|
+
require_relative 'auth_store'
|
13
|
+
require_relative 'spidr'
|
12
14
|
|
13
15
|
require 'openssl'
|
14
16
|
require 'net/http'
|
@@ -19,12 +21,12 @@ module Spidr
|
|
19
21
|
|
20
22
|
include Settings::UserAgent
|
21
23
|
|
22
|
-
# HTTP Host Header to use
|
24
|
+
# HTTP `Host` header to use
|
23
25
|
#
|
24
26
|
# @return [String]
|
25
27
|
attr_accessor :host_header
|
26
28
|
|
27
|
-
# HTTP Host Headers to use for specific hosts
|
29
|
+
# HTTP `Host` Headers to use for specific hosts
|
28
30
|
#
|
29
31
|
# @return [Hash{String,Regexp => String}]
|
30
32
|
attr_reader :host_headers
|
@@ -96,70 +98,110 @@ module Spidr
|
|
96
98
|
#
|
97
99
|
# Creates a new Agent object.
|
98
100
|
#
|
99
|
-
# @param [
|
100
|
-
#
|
101
|
+
# @param [String, nil] host_header
|
102
|
+
# The HTTP `Host` header to use with each request.
|
101
103
|
#
|
102
|
-
# @
|
103
|
-
#
|
104
|
+
# @param [Hash{String,Regexp => String}] host_headers
|
105
|
+
# The HTTP `Host` headers to use for specific hosts.
|
104
106
|
#
|
105
|
-
# @
|
107
|
+
# @param [Hash{String => String}] default_headers
|
108
|
+
# Default headers to set for every request.
|
109
|
+
#
|
110
|
+
# @param [String, nil] user_agent
|
111
|
+
# The `User-Agent` string to send with each request.
|
112
|
+
#
|
113
|
+
# @param [String, nil] referer
|
114
|
+
# The `Referer` URL to send with each request.
|
115
|
+
#
|
116
|
+
# @param [Integer, nil] open_timeout
|
117
|
+
# Optional open connection timeout.
|
118
|
+
#
|
119
|
+
# @param [Integer, nil] read_timeout
|
106
120
|
# Optional read timeout.
|
107
121
|
#
|
108
|
-
# @
|
109
|
-
# Optional
|
122
|
+
# @param [Integer, nil] ssl_timeout
|
123
|
+
# Optional SSL connection timeout.
|
110
124
|
#
|
111
|
-
# @
|
125
|
+
# @param [Integer, nil] continue_timeout
|
112
126
|
# Optional continue timeout.
|
113
127
|
#
|
114
|
-
# @
|
115
|
-
# Optional
|
128
|
+
# @param [Integer, nil] keep_alive_timeout
|
129
|
+
# Optional `Keep-Alive` timeout.
|
116
130
|
#
|
117
|
-
# @
|
131
|
+
# @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
|
118
132
|
# The proxy information to use.
|
119
133
|
#
|
120
|
-
# @option
|
134
|
+
# @option proxy [String] :host
|
121
135
|
# The host the proxy is running on.
|
122
136
|
#
|
123
|
-
# @option
|
137
|
+
# @option proxy [Integer] :port (8080)
|
124
138
|
# The port the proxy is running on.
|
125
139
|
#
|
126
|
-
# @option
|
140
|
+
# @option proxy [String, nil] :user
|
127
141
|
# The user to authenticate as with the proxy.
|
128
142
|
#
|
129
|
-
# @option
|
143
|
+
# @option proxy [String, nil] :password
|
130
144
|
# The password to authenticate with.
|
131
145
|
#
|
132
|
-
# @
|
133
|
-
#
|
146
|
+
# @param [Integer] delay
|
147
|
+
# The number of seconds to pause between each request.
|
134
148
|
#
|
135
|
-
# @
|
136
|
-
# The
|
149
|
+
# @param [Integer, nil] limit
|
150
|
+
# The maximum number of pages to visit.
|
137
151
|
#
|
138
|
-
# @
|
139
|
-
# The
|
152
|
+
# @param [Integer, nil] max_depth
|
153
|
+
# The maximum link depth to follow.
|
140
154
|
#
|
141
|
-
# @
|
142
|
-
# The
|
155
|
+
# @param [Set, Array, nil] queue
|
156
|
+
# The initial queue of URLs to visit.
|
143
157
|
#
|
144
|
-
# @
|
145
|
-
# The
|
158
|
+
# @param [Set, Array, nil] history
|
159
|
+
# The initial list of visited URLs.
|
146
160
|
#
|
147
|
-
# @
|
148
|
-
#
|
161
|
+
# @param [Boolean] strip_fragments
|
162
|
+
# Controls whether to strip the fragment components from the URLs.
|
149
163
|
#
|
150
|
-
# @
|
151
|
-
#
|
164
|
+
# @param [Boolean] strip_query
|
165
|
+
# Controls whether to strip the query components from the URLs.
|
152
166
|
#
|
153
|
-
# @
|
154
|
-
# The
|
167
|
+
# @param [Array<String>] schemes
|
168
|
+
# The list of acceptable URI schemes to visit.
|
169
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
155
170
|
#
|
156
|
-
# @
|
157
|
-
# The
|
171
|
+
# @param [String] host
|
172
|
+
# The host-name to visit.
|
158
173
|
#
|
159
|
-
# @
|
160
|
-
# The
|
174
|
+
# @param [Array<String, Regexp, Proc>] hosts
|
175
|
+
# The patterns which match the host-names to visit.
|
161
176
|
#
|
162
|
-
# @
|
177
|
+
# @param [Array<String, Regexp, Proc>] ignore_hosts
|
178
|
+
# The patterns which match the host-names to not visit.
|
179
|
+
#
|
180
|
+
# @param [Array<Integer, Regexp, Proc>] ports
|
181
|
+
# The patterns which match the ports to visit.
|
182
|
+
#
|
183
|
+
# @param [Array<Integer, Regexp, Proc>] ignore_ports
|
184
|
+
# The patterns which match the ports to not visit.
|
185
|
+
#
|
186
|
+
# @param [Array<String, Regexp, Proc>] links
|
187
|
+
# The patterns which match the links to visit.
|
188
|
+
#
|
189
|
+
# @param [Array<String, Regexp, Proc>] ignore_links
|
190
|
+
# The patterns which match the links to not visit.
|
191
|
+
#
|
192
|
+
# @param [Array<String, Regexp, Proc>] urls
|
193
|
+
# The patterns which match the URLs to visit.
|
194
|
+
#
|
195
|
+
# @param [Array<String, Regexp, Proc>] ignore_urls
|
196
|
+
# The patterns which match the URLs to not visit.
|
197
|
+
#
|
198
|
+
# @param [Array<String, Regexp, Proc>] exts
|
199
|
+
# The patterns which match the URI path extensions to visit.
|
200
|
+
#
|
201
|
+
# @param [Array<String, Regexp, Proc>] ignore_exts
|
202
|
+
# The patterns which match the URI path extensions to not visit.
|
203
|
+
#
|
204
|
+
# @param [Boolean] robots
|
163
205
|
# Specifies whether `robots.txt` should be honored.
|
164
206
|
#
|
165
207
|
# @yield [agent]
|
@@ -169,58 +211,99 @@ module Spidr
|
|
169
211
|
# @yieldparam [Agent] agent
|
170
212
|
# The newly created agent.
|
171
213
|
#
|
172
|
-
#
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
214
|
+
def initialize(# header keyword arguments
|
215
|
+
host_header: nil,
|
216
|
+
host_headers: {},
|
217
|
+
default_headers: {},
|
218
|
+
user_agent: Spidr.user_agent,
|
219
|
+
referer: nil,
|
220
|
+
# session cache keyword arguments
|
221
|
+
proxy: Spidr.proxy,
|
222
|
+
open_timeout: Spidr.open_timeout,
|
223
|
+
ssl_timeout: Spidr.ssl_timeout,
|
224
|
+
read_timeout: Spidr.read_timeout,
|
225
|
+
continue_timeout: Spidr.continue_timeout,
|
226
|
+
keep_alive_timeout: Spidr.keep_alive_timeout,
|
227
|
+
# spidering controls keyword arguments
|
228
|
+
delay: 0,
|
229
|
+
limit: nil,
|
230
|
+
max_depth: nil,
|
231
|
+
# history keyword arguments
|
232
|
+
queue: nil,
|
233
|
+
history: nil,
|
234
|
+
# sanitizer keyword arguments
|
235
|
+
strip_fragments: true,
|
236
|
+
strip_query: false,
|
237
|
+
# filtering keyword arguments
|
238
|
+
schemes: self.class.default_schemes,
|
239
|
+
host: nil,
|
240
|
+
hosts: nil,
|
241
|
+
ignore_hosts: nil,
|
242
|
+
ports: nil,
|
243
|
+
ignore_ports: nil,
|
244
|
+
links: nil,
|
245
|
+
ignore_links: nil,
|
246
|
+
urls: nil,
|
247
|
+
ignore_urls: nil,
|
248
|
+
exts: nil,
|
249
|
+
ignore_exts: nil,
|
250
|
+
# robots keyword arguments
|
251
|
+
robots: Spidr.robots?)
|
252
|
+
@host_header = host_header
|
253
|
+
@host_headers = host_headers
|
254
|
+
|
255
|
+
@default_headers = default_headers
|
256
|
+
|
257
|
+
@user_agent = user_agent
|
258
|
+
@referer = referer
|
259
|
+
|
260
|
+
@sessions = SessionCache.new(
|
261
|
+
proxy: proxy,
|
262
|
+
open_timeout: open_timeout,
|
263
|
+
ssl_timeout: ssl_timeout,
|
264
|
+
read_timeout: read_timeout,
|
265
|
+
continue_timeout: continue_timeout,
|
266
|
+
keep_alive_timeout: keep_alive_timeout
|
267
|
+
)
|
195
268
|
@cookies = CookieJar.new
|
196
269
|
@authorized = AuthStore.new
|
197
270
|
|
198
271
|
@running = false
|
199
|
-
@delay =
|
272
|
+
@delay = delay
|
200
273
|
@history = Set[]
|
201
274
|
@failures = Set[]
|
202
275
|
@queue = []
|
203
276
|
|
204
|
-
@limit =
|
277
|
+
@limit = limit
|
205
278
|
@levels = Hash.new(0)
|
206
|
-
@max_depth =
|
207
|
-
|
208
|
-
if
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
279
|
+
@max_depth = max_depth
|
280
|
+
|
281
|
+
self.queue = queue if queue
|
282
|
+
self.history = history if history
|
283
|
+
|
284
|
+
initialize_sanitizers(
|
285
|
+
strip_fragments: strip_fragments,
|
286
|
+
strip_query: strip_query
|
287
|
+
)
|
288
|
+
|
289
|
+
initialize_filters(
|
290
|
+
schemes: schemes,
|
291
|
+
host: host,
|
292
|
+
hosts: hosts,
|
293
|
+
ignore_hosts: ignore_hosts,
|
294
|
+
ports: ports,
|
295
|
+
ignore_ports: ignore_ports,
|
296
|
+
links: links,
|
297
|
+
ignore_links: ignore_links,
|
298
|
+
urls: urls,
|
299
|
+
ignore_urls: ignore_urls,
|
300
|
+
exts: exts,
|
301
|
+
ignore_exts: ignore_exts
|
302
|
+
)
|
303
|
+
initialize_actions
|
304
|
+
initialize_events
|
305
|
+
|
306
|
+
initialize_robots if robots
|
224
307
|
|
225
308
|
yield self if block_given?
|
226
309
|
end
|
@@ -231,8 +314,8 @@ module Spidr
|
|
231
314
|
# @param [URI::HTTP, String] url
|
232
315
|
# The URL to start spidering at.
|
233
316
|
#
|
234
|
-
# @param [Hash]
|
235
|
-
# Additional
|
317
|
+
# @param [Hash{Symbol => Object}] kwargs
|
318
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
236
319
|
#
|
237
320
|
# @yield [agent]
|
238
321
|
# If a block is given, it will be passed the newly created agent
|
@@ -241,12 +324,16 @@ module Spidr
|
|
241
324
|
# @yieldparam [Agent] agent
|
242
325
|
# The newly created agent.
|
243
326
|
#
|
327
|
+
# @return [Agent]
|
328
|
+
# The created agent object.
|
329
|
+
#
|
244
330
|
# @see #initialize
|
245
331
|
# @see #start_at
|
246
332
|
#
|
247
|
-
def self.start_at(url
|
248
|
-
agent = new(
|
333
|
+
def self.start_at(url,**kwargs,&block)
|
334
|
+
agent = new(**kwargs,&block)
|
249
335
|
agent.start_at(url)
|
336
|
+
return agent
|
250
337
|
end
|
251
338
|
|
252
339
|
#
|
@@ -255,8 +342,8 @@ module Spidr
|
|
255
342
|
# @param [URI::HTTP, String] url
|
256
343
|
# The web-site to spider.
|
257
344
|
#
|
258
|
-
# @param [Hash]
|
259
|
-
# Additional
|
345
|
+
# @param [Hash{Symbol => Object}] kwargs
|
346
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
260
347
|
#
|
261
348
|
# @yield [agent]
|
262
349
|
# If a block is given, it will be passed the newly created agent
|
@@ -265,13 +352,17 @@ module Spidr
|
|
265
352
|
# @yieldparam [Agent] agent
|
266
353
|
# The newly created agent.
|
267
354
|
#
|
355
|
+
# @return [Agent]
|
356
|
+
# The created agent object.
|
357
|
+
#
|
268
358
|
# @see #initialize
|
269
359
|
#
|
270
|
-
def self.site(url
|
360
|
+
def self.site(url,**kwargs,&block)
|
271
361
|
url = URI(url)
|
272
362
|
|
273
|
-
agent = new(
|
363
|
+
agent = new(host: url.host, **kwargs, &block)
|
274
364
|
agent.start_at(url)
|
365
|
+
return agent
|
275
366
|
end
|
276
367
|
|
277
368
|
#
|
@@ -280,8 +371,35 @@ module Spidr
|
|
280
371
|
# @param [String] name
|
281
372
|
# The host-name to spider.
|
282
373
|
#
|
283
|
-
# @param [Hash]
|
284
|
-
# Additional
|
374
|
+
# @param [Hash{Symbol => Object}] kwargs
|
375
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
376
|
+
#
|
377
|
+
# @yield [agent]
|
378
|
+
# If a block is given, it will be passed the newly created agent
|
379
|
+
# before it begins spidering.
|
380
|
+
#
|
381
|
+
# @yieldparam [Agent] agent
|
382
|
+
# The newly created agent.
|
383
|
+
#
|
384
|
+
# @return [Agent]
|
385
|
+
# The created agent object.
|
386
|
+
#
|
387
|
+
# @see #initialize
|
388
|
+
#
|
389
|
+
def self.host(name,**kwargs,&block)
|
390
|
+
agent = new(host: name, **kwargs, &block)
|
391
|
+
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
392
|
+
return agent
|
393
|
+
end
|
394
|
+
|
395
|
+
#
|
396
|
+
# Creates a new agent and spiders the entire domain.
|
397
|
+
#
|
398
|
+
# @param [String] name
|
399
|
+
# The top-level domain to spider.
|
400
|
+
#
|
401
|
+
# @param [Hash{Symbol => Object}] kwargs
|
402
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
285
403
|
#
|
286
404
|
# @yield [agent]
|
287
405
|
# If a block is given, it will be passed the newly created agent
|
@@ -290,11 +408,17 @@ module Spidr
|
|
290
408
|
# @yieldparam [Agent] agent
|
291
409
|
# The newly created agent.
|
292
410
|
#
|
411
|
+
# @return [Agent]
|
412
|
+
# The created agent object.
|
413
|
+
#
|
293
414
|
# @see #initialize
|
294
415
|
#
|
295
|
-
|
296
|
-
|
416
|
+
# @since 0.7.0
|
417
|
+
#
|
418
|
+
def self.domain(name,**kwargs,&block)
|
419
|
+
agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
|
297
420
|
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
421
|
+
return agent
|
298
422
|
end
|
299
423
|
|
300
424
|
#
|
@@ -314,10 +438,10 @@ module Spidr
|
|
314
438
|
#
|
315
439
|
# Sets the proxy information that the agent uses.
|
316
440
|
#
|
317
|
-
# @param [Proxy] new_proxy
|
441
|
+
# @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
|
318
442
|
# The new proxy information.
|
319
443
|
#
|
320
|
-
# @return [
|
444
|
+
# @return [Proxy]
|
321
445
|
# The new proxy information.
|
322
446
|
#
|
323
447
|
# @see SessionCache#proxy=
|
@@ -534,7 +658,7 @@ module Spidr
|
|
534
658
|
def enqueue(url,level=0)
|
535
659
|
url = sanitize_url(url)
|
536
660
|
|
537
|
-
if (!
|
661
|
+
if (!queued?(url) && visit?(url))
|
538
662
|
link = url.to_s
|
539
663
|
|
540
664
|
begin
|
@@ -633,7 +757,7 @@ module Spidr
|
|
633
757
|
end
|
634
758
|
|
635
759
|
#
|
636
|
-
# Visits a given URL, and
|
760
|
+
# Visits a given URL, and enqueues the links recovered from the URL
|
637
761
|
# to be visited later.
|
638
762
|
#
|
639
763
|
# @param [URI::HTTP, String] url
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'extensions/uri'
|
4
|
+
require_relative 'auth_credential'
|
5
|
+
require_relative 'page'
|
4
6
|
|
5
7
|
require 'base64'
|
6
8
|
|
@@ -20,7 +22,7 @@ module Spidr
|
|
20
22
|
@credentials = {}
|
21
23
|
end
|
22
24
|
|
23
|
-
#
|
25
|
+
#
|
24
26
|
# Given a URL, return the most specific matching auth credential.
|
25
27
|
#
|
26
28
|
# @param [URI] url
|
@@ -54,7 +56,7 @@ module Spidr
|
|
54
56
|
return nil
|
55
57
|
end
|
56
58
|
|
57
|
-
#
|
59
|
+
#
|
58
60
|
# Add an auth credential to the store for supplied base URL.
|
59
61
|
#
|
60
62
|
# @param [URI] url
|
@@ -109,7 +111,7 @@ module Spidr
|
|
109
111
|
# or `nil` if no authorization exists.
|
110
112
|
#
|
111
113
|
# @param [URI] url
|
112
|
-
# The
|
114
|
+
# The URL.
|
113
115
|
#
|
114
116
|
# @return [String, nil]
|
115
117
|
# The base64 encoded authorization string or `nil`.
|
@@ -122,7 +124,7 @@ module Spidr
|
|
122
124
|
end
|
123
125
|
end
|
124
126
|
|
125
|
-
#
|
127
|
+
#
|
126
128
|
# Clear the contents of the auth store.
|
127
129
|
#
|
128
130
|
# @return [AuthStore]
|
data/lib/spidr/cookie_jar.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'page'
|
2
4
|
|
3
5
|
require 'set'
|
4
6
|
|
@@ -42,8 +44,8 @@ module Spidr
|
|
42
44
|
@params.each(&block)
|
43
45
|
end
|
44
46
|
|
45
|
-
#
|
46
|
-
# Return all relevant cookies in a single string for the
|
47
|
+
#
|
48
|
+
# Return all relevant cookies in a single string for the
|
47
49
|
# named host or domain (in browser request format).
|
48
50
|
#
|
49
51
|
# @param [String] host
|
@@ -59,7 +61,7 @@ module Spidr
|
|
59
61
|
@params[host] ||= {}
|
60
62
|
end
|
61
63
|
|
62
|
-
#
|
64
|
+
#
|
63
65
|
# Add a cookie to the jar for a particular domain.
|
64
66
|
#
|
65
67
|
# @param [String] host
|
@@ -166,7 +168,7 @@ module Spidr
|
|
166
168
|
return host_cookies
|
167
169
|
end
|
168
170
|
|
169
|
-
#
|
171
|
+
#
|
170
172
|
# Clear out the jar, removing all stored cookies.
|
171
173
|
#
|
172
174
|
# @since 0.2.2
|
data/lib/spidr/extensions/uri.rb
CHANGED
data/lib/spidr/extensions.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Spidr
|
2
4
|
class Page
|
3
5
|
#
|
@@ -221,5 +223,56 @@ module Spidr
|
|
221
223
|
def zip?
|
222
224
|
is_content_type?('application/zip')
|
223
225
|
end
|
226
|
+
|
227
|
+
#
|
228
|
+
# Determines if the page is a PNG image.
|
229
|
+
#
|
230
|
+
# @return [Boolean]
|
231
|
+
# Specifies whether the page is a PNG image.
|
232
|
+
#
|
233
|
+
# @since 0.7.0
|
234
|
+
#
|
235
|
+
def png?
|
236
|
+
is_content_type?('image/png')
|
237
|
+
end
|
238
|
+
|
239
|
+
#
|
240
|
+
# Determines if the page is a GIF image.
|
241
|
+
#
|
242
|
+
# @return [Boolean]
|
243
|
+
# Specifies whether the page is a GIF image.
|
244
|
+
#
|
245
|
+
# @since 0.7.0
|
246
|
+
#
|
247
|
+
def gif?
|
248
|
+
is_content_type?('image/gif')
|
249
|
+
end
|
250
|
+
|
251
|
+
#
|
252
|
+
# Determines if the page is a JPEG image.
|
253
|
+
#
|
254
|
+
# @return [Boolean]
|
255
|
+
# Specifies whether the page is a JPEG image.
|
256
|
+
#
|
257
|
+
# @since 0.7.0
|
258
|
+
#
|
259
|
+
def jpeg?
|
260
|
+
is_content_type?('image/jpeg')
|
261
|
+
end
|
262
|
+
|
263
|
+
#
|
264
|
+
# Determines if the page is an ICO image.
|
265
|
+
#
|
266
|
+
# @return [Boolean]
|
267
|
+
# Specifies whether the page is an ICO image.
|
268
|
+
#
|
269
|
+
# @since 0.7.0
|
270
|
+
#
|
271
|
+
def ico?
|
272
|
+
is_content_type?('image/x-icon') ||
|
273
|
+
is_content_type?('image/vnd.microsoft.icon')
|
274
|
+
end
|
275
|
+
|
276
|
+
alias icon? ico?
|
224
277
|
end
|
225
278
|
end
|