spidr 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/spidr/agent.rb CHANGED
@@ -19,12 +19,12 @@ module Spidr
19
19
 
20
20
  include Settings::UserAgent
21
21
 
22
- # HTTP Host Header to use
22
+ # HTTP `Host` Header to use
23
23
  #
24
24
  # @return [String]
25
25
  attr_accessor :host_header
26
26
 
27
- # HTTP Host Headers to use for specific hosts
27
+ # HTTP `Host` Headers to use for specific hosts
28
28
  #
29
29
  # @return [Hash{String,Regexp => String}]
30
30
  attr_reader :host_headers
@@ -96,70 +96,110 @@ module Spidr
96
96
  #
97
97
  # Creates a new Agent object.
98
98
  #
99
- # @param [Hash] options
100
- # Additional options
99
+ # @param [String, nil] host_header
100
+ # The HTTP `Host` header to use with each request.
101
101
  #
102
- # @option options [Integer] :open_timeout (Spidr.open_timeout)
103
- # Optional open timeout.
102
+ # @param [Hash{String,Regexp => String}] host_headers
103
+ # The HTTP `Host` headers to use for specific hosts.
104
104
  #
105
- # @option options [Integer] :read_timeout (Spidr.read_timeout)
105
+ # @param [Hash{String => String}] default_headers
106
+ # Default headers to set for every request.
107
+ #
108
+ # @param [String, nil] user_agent
109
+ # The `User-Agent` string to send with each request.
110
+ #
111
+ # @param [String, nil] referer
112
+ # The `Referer` URL to send with each request.
113
+ #
114
+ # @param [Integer, nil] open_timeout
115
+ # Optional open connection timeout.
116
+ #
117
+ # @param [Integer, nil] read_timeout
106
118
  # Optional read timeout.
107
119
  #
108
- # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
109
- # Optional ssl timeout.
120
+ # @param [Integer, nil] ssl_timeout
121
+ # Optional SSL connection timeout.
110
122
  #
111
- # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
123
+ # @param [Integer, nil] continue_timeout
112
124
  # Optional continue timeout.
113
125
  #
114
- # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
115
- # Optional keep_alive timeout.
126
+ # @param [Integer, nil] keep_alive_timeout
127
+ # Optional `Keep-Alive` timeout.
116
128
  #
117
- # @option options [Hash] :proxy (Spidr.proxy)
129
+ # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
118
130
  # The proxy information to use.
119
131
  #
120
- # @option :proxy [String] :host
132
+ # @option proxy [String] :host
121
133
  # The host the proxy is running on.
122
134
  #
123
- # @option :proxy [Integer] :port
135
+ # @option proxy [Integer] :port (8080)
124
136
  # The port the proxy is running on.
125
137
  #
126
- # @option :proxy [String] :user
138
+ # @option proxy [String, nil] :user
127
139
  # The user to authenticate as with the proxy.
128
140
  #
129
- # @option :proxy [String] :password
141
+ # @option proxy [String, nil] :password
130
142
  # The password to authenticate with.
131
143
  #
132
- # @option options [Hash{String => String}] :default_headers
133
- # Default headers to set for every request.
144
+ # @param [Integer] delay
145
+ # The number of seconds to pause between each request.
134
146
  #
135
- # @option options [String] :host_header
136
- # The HTTP Host header to use with each request.
147
+ # @param [Integer, nil] limit
148
+ # The maximum number of pages to visit.
137
149
  #
138
- # @option options [Hash{String,Regexp => String}] :host_headers
139
- # The HTTP Host headers to use for specific hosts.
150
+ # @param [Integer, nil] max_depth
151
+ # The maximum link depth to follow.
140
152
  #
141
- # @option options [String] :user_agent (Spidr.user_agent)
142
- # The User-Agent string to send with each requests.
153
+ # @param [Set, Array, nil] queue
154
+ # The initial queue of URLs to visit.
143
155
  #
144
- # @option options [String] :referer
145
- # The Referer URL to send with each request.
156
+ # @param [Set, Array, nil] history
157
+ # The initial list of visited URLs.
146
158
  #
147
- # @option options [Integer] :delay (0)
148
- # The number of seconds to pause between each request.
159
+ # @param [Boolean] strip_fragments
160
+ # Controls whether to strip the fragment components from the URLs.
149
161
  #
150
- # @option options [Set, Array] :queue
151
- # The initial queue of URLs to visit.
162
+ # @param [Boolean] strip_query
163
+ # Controls whether to strip the query components from the URLs.
152
164
  #
153
- # @option options [Set, Array] :history
154
- # The initial list of visited URLs.
165
+ # @param [Array<String>] schemes
166
+ # The list of acceptable URI schemes to visit.
167
+ # The `https` scheme will be ignored if `net/https` cannot be loaded.
155
168
  #
156
- # @option options [Integer] :limit
157
- # The maximum number of pages to visit.
169
+ # @param [String] host
170
+ # The host-name to visit.
158
171
  #
159
- # @option options [Integer] :max_depth
160
- # The maximum link depth to follow.
172
+ # @param [Array<String, Regexp, Proc>] hosts
173
+ # The patterns which match the host-names to visit.
161
174
  #
162
- # @option options [Boolean] :robots (Spidr.robots?)
175
+ # @param [Array<String, Regexp, Proc>] ignore_hosts
176
+ # The patterns which match the host-names to not visit.
177
+ #
178
+ # @param [Array<Integer, Regexp, Proc>] ports
179
+ # The patterns which match the ports to visit.
180
+ #
181
+ # @param [Array<Integer, Regexp, Proc>] ignore_ports
182
+ # The patterns which match the ports to not visit.
183
+ #
184
+ # @param [Array<String, Regexp, Proc>] links
185
+ # The patterns which match the links to visit.
186
+ #
187
+ # @param [Array<String, Regexp, Proc>] ignore_links
188
+ # The patterns which match the links to not visit.
189
+ #
190
+ # @param [Array<String, Regexp, Proc>] urls
191
+ # The patterns which match the URLs to visit.
192
+ #
193
+ # @param [Array<String, Regexp, Proc>] ignore_urls
194
+ # The patterns which match the URLs to not visit.
195
+ #
196
+ # @param [Array<String, Regexp, Proc>] exts
197
+ # The patterns which match the URI path extensions to visit.
198
+ #
199
+ # @param [Array<String, Regexp, Proc>] ignore_exts
200
+ # The patterns which match the URI path extensions to not visit.
201
+ #
202
+ # @param [Boolean] robots
163
203
  # Specifies whether `robots.txt` should be honored.
164
204
  #
165
205
  # @yield [agent]
@@ -169,58 +209,99 @@ module Spidr
169
209
  # @yieldparam [Agent] agent
170
210
  # The newly created agent.
171
211
  #
172
- # @see #initialize_sanitizers
173
- # @see #initialize_filters
174
- # @see #initialize_actions
175
- # @see #initialize_events
176
- #
177
- def initialize(options={})
178
- @host_header = options[:host_header]
179
- @host_headers = {}
180
-
181
- if options[:host_headers]
182
- @host_headers.merge!(options[:host_headers])
183
- end
184
-
185
- @default_headers = {}
186
-
187
- if options[:default_headers]
188
- @default_headers.merge!(options[:default_headers])
189
- end
190
-
191
- @user_agent = options.fetch(:user_agent,Spidr.user_agent)
192
- @referer = options[:referer]
193
-
194
- @sessions = SessionCache.new(options)
212
+ def initialize(# header keyword arguments
213
+ host_header: nil,
214
+ host_headers: {},
215
+ default_headers: {},
216
+ user_agent: Spidr.user_agent,
217
+ referer: nil,
218
+ # session cache keyword arguments
219
+ proxy: Spidr.proxy,
220
+ open_timeout: Spidr.open_timeout,
221
+ ssl_timeout: Spidr.ssl_timeout,
222
+ read_timeout: Spidr.read_timeout,
223
+ continue_timeout: Spidr.continue_timeout,
224
+ keep_alive_timeout: Spidr.keep_alive_timeout,
225
+ # spidering controls keyword arguments
226
+ delay: 0,
227
+ limit: nil,
228
+ max_depth: nil,
229
+ # history keyword arguments
230
+ queue: nil,
231
+ history: nil,
232
+ # sanitizer keyword arguments
233
+ strip_fragments: true,
234
+ strip_query: false,
235
+ # filtering keyword arguments
236
+ schemes: self.class.default_schemes,
237
+ host: nil,
238
+ hosts: nil,
239
+ ignore_hosts: nil,
240
+ ports: nil,
241
+ ignore_ports: nil,
242
+ links: nil,
243
+ ignore_links: nil,
244
+ urls: nil,
245
+ ignore_urls: nil,
246
+ exts: nil,
247
+ ignore_exts: nil,
248
+ # robots keyword arguments
249
+ robots: Spidr.robots?)
250
+ @host_header = host_header
251
+ @host_headers = host_headers
252
+
253
+ @default_headers = default_headers
254
+
255
+ @user_agent = user_agent
256
+ @referer = referer
257
+
258
+ @sessions = SessionCache.new(
259
+ proxy: proxy,
260
+ open_timeout: open_timeout,
261
+ ssl_timeout: ssl_timeout,
262
+ read_timeout: read_timeout,
263
+ continue_timeout: continue_timeout,
264
+ keep_alive_timeout: keep_alive_timeout
265
+ )
195
266
  @cookies = CookieJar.new
196
267
  @authorized = AuthStore.new
197
268
 
198
269
  @running = false
199
- @delay = options.fetch(:delay,0)
270
+ @delay = delay
200
271
  @history = Set[]
201
272
  @failures = Set[]
202
273
  @queue = []
203
274
 
204
- @limit = options[:limit]
275
+ @limit = limit
205
276
  @levels = Hash.new(0)
206
- @max_depth = options[:max_depth]
207
-
208
- if options[:queue]
209
- self.queue = options[:queue]
210
- end
211
-
212
- if options[:history]
213
- self.history = options[:history]
214
- end
215
-
216
- initialize_sanitizers(options)
217
- initialize_filters(options)
218
- initialize_actions(options)
219
- initialize_events(options)
220
-
221
- if options.fetch(:robots,Spidr.robots?)
222
- initialize_robots
223
- end
277
+ @max_depth = max_depth
278
+
279
+ self.queue = queue if queue
280
+ self.history = history if history
281
+
282
+ initialize_sanitizers(
283
+ strip_fragments: strip_fragments,
284
+ strip_query: strip_query
285
+ )
286
+
287
+ initialize_filters(
288
+ schemes: schemes,
289
+ host: host,
290
+ hosts: hosts,
291
+ ignore_hosts: ignore_hosts,
292
+ ports: ports,
293
+ ignore_ports: ignore_ports,
294
+ links: links,
295
+ ignore_links: ignore_links,
296
+ urls: urls,
297
+ ignore_urls: ignore_urls,
298
+ exts: exts,
299
+ ignore_exts: ignore_exts
300
+ )
301
+ initialize_actions
302
+ initialize_events
303
+
304
+ initialize_robots if robots
224
305
 
225
306
  yield self if block_given?
226
307
  end
@@ -231,8 +312,8 @@ module Spidr
231
312
  # @param [URI::HTTP, String] url
232
313
  # The URL to start spidering at.
233
314
  #
234
- # @param [Hash] options
235
- # Additional options. See {Agent#initialize}.
315
+ # @param [Hash{Symbol => Object}] kwargs
316
+ # Additional keyword arguments. See {Agent#initialize}.
236
317
  #
237
318
  # @yield [agent]
238
319
  # If a block is given, it will be passed the newly created agent
@@ -241,12 +322,16 @@ module Spidr
241
322
  # @yieldparam [Agent] agent
242
323
  # The newly created agent.
243
324
  #
325
+ # @return [Agent]
326
+ # The created agent object.
327
+ #
244
328
  # @see #initialize
245
329
  # @see #start_at
246
330
  #
247
- def self.start_at(url,options={},&block)
248
- agent = new(options,&block)
331
+ def self.start_at(url,**kwargs,&block)
332
+ agent = new(**kwargs,&block)
249
333
  agent.start_at(url)
334
+ return agent
250
335
  end
251
336
 
252
337
  #
@@ -255,8 +340,8 @@ module Spidr
255
340
  # @param [URI::HTTP, String] url
256
341
  # The web-site to spider.
257
342
  #
258
- # @param [Hash] options
259
- # Additional options. See {Agent#initialize}.
343
+ # @param [Hash{Symbol => Object}] kwargs
344
+ # Additional keyword arguments. See {Agent#initialize}.
260
345
  #
261
346
  # @yield [agent]
262
347
  # If a block is given, it will be passed the newly created agent
@@ -265,13 +350,17 @@ module Spidr
265
350
  # @yieldparam [Agent] agent
266
351
  # The newly created agent.
267
352
  #
353
+ # @return [Agent]
354
+ # The created agent object.
355
+ #
268
356
  # @see #initialize
269
357
  #
270
- def self.site(url,options={},&block)
358
+ def self.site(url,**kwargs,&block)
271
359
  url = URI(url)
272
360
 
273
- agent = new(options.merge(host: url.host),&block)
361
+ agent = new(host: url.host, **kwargs, &block)
274
362
  agent.start_at(url)
363
+ return agent
275
364
  end
276
365
 
277
366
  #
@@ -280,8 +369,35 @@ module Spidr
280
369
  # @param [String] name
281
370
  # The host-name to spider.
282
371
  #
283
- # @param [Hash] options
284
- # Additional options. See {Agent#initialize}.
372
+ # @param [Hash{Symbol => Object}] kwargs
373
+ # Additional keyword arguments. See {Agent#initialize}.
374
+ #
375
+ # @yield [agent]
376
+ # If a block is given, it will be passed the newly created agent
377
+ # before it begins spidering.
378
+ #
379
+ # @yieldparam [Agent] agent
380
+ # The newly created agent.
381
+ #
382
+ # @return [Agent]
383
+ # The created agent object.
384
+ #
385
+ # @see #initialize
386
+ #
387
+ def self.host(name,**kwargs,&block)
388
+ agent = new(host: name, **kwargs, &block)
389
+ agent.start_at(URI::HTTP.build(host: name, path: '/'))
390
+ return agent
391
+ end
392
+
393
+ #
394
+ # Creates a new agent and spiders the entire domain.
395
+ #
396
+ # @param [String] name
397
+ # The top-level domain to spider.
398
+ #
399
+ # @param [Hash{Symbol => Object}] kwargs
400
+ # Additional keyword arguments. See {Agent#initialize}.
285
401
  #
286
402
  # @yield [agent]
287
403
  # If a block is given, it will be passed the newly created agent
@@ -290,11 +406,17 @@ module Spidr
290
406
  # @yieldparam [Agent] agent
291
407
  # The newly created agent.
292
408
  #
409
+ # @return [Agent]
410
+ # The created agent object.
411
+ #
293
412
  # @see #initialize
294
413
  #
295
- def self.host(name,options={},&block)
296
- agent = new(options.merge(host: name),&block)
414
+ # @since 0.7.0
415
+ #
416
+ def self.domain(name,**kwargs,&block)
417
+ agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
297
418
  agent.start_at(URI::HTTP.build(host: name, path: '/'))
419
+ return agent
298
420
  end
299
421
 
300
422
  #
@@ -314,10 +436,10 @@ module Spidr
314
436
  #
315
437
  # Sets the proxy information that the agent uses.
316
438
  #
317
- # @param [Proxy] new_proxy
439
+ # @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
318
440
  # The new proxy information.
319
441
  #
320
- # @return [Hash]
442
+ # @return [Proxy]
321
443
  # The new proxy information.
322
444
  #
323
445
  # @see SessionCache#proxy=
@@ -534,7 +656,7 @@ module Spidr
534
656
  def enqueue(url,level=0)
535
657
  url = sanitize_url(url)
536
658
 
537
- if (!(queued?(url)) && visit?(url))
659
+ if (!queued?(url) && visit?(url))
538
660
  link = url.to_s
539
661
 
540
662
  begin
@@ -633,7 +755,7 @@ module Spidr
633
755
  end
634
756
 
635
757
  #
636
- # Visits a given URL, and enqueus the links recovered from the URL
758
+ # Visits a given URL, and enqueues the links recovered from the URL
637
759
  # to be visited later.
638
760
  #
639
761
  # @param [URI::HTTP, String] url
@@ -109,7 +109,7 @@ module Spidr
109
109
  # or `nil` if no authorization exists.
110
110
  #
111
111
  # @param [URI] url
112
- # The url.
112
+ # The URL.
113
113
  #
114
114
  # @return [String, nil]
115
115
  # The base64 encoded authorization string or `nil`.
@@ -221,5 +221,56 @@ module Spidr
221
221
  def zip?
222
222
  is_content_type?('application/zip')
223
223
  end
224
+
225
+ #
226
+ # Determines if the page is a PNG image.
227
+ #
228
+ # @return [Boolean]
229
+ # Specifies whether the page is a PNG image.
230
+ #
231
+ # @since 0.7.0
232
+ #
233
+ def png?
234
+ is_content_type?('image/png')
235
+ end
236
+
237
+ #
238
+ # Determines if the page is a GIF image.
239
+ #
240
+ # @return [Boolean]
241
+ # Specifies whether the page is a GIF image.
242
+ #
243
+ # @since 0.7.0
244
+ #
245
+ def gif?
246
+ is_content_type?('image/gif')
247
+ end
248
+
249
+ #
250
+ # Determines if the page is a JPEG image.
251
+ #
252
+ # @return [Boolean]
253
+ # Specifies whether the page is a JPEG image.
254
+ #
255
+ # @since 0.7.0
256
+ #
257
+ def jpeg?
258
+ is_content_type?('image/jpeg')
259
+ end
260
+
261
+ #
262
+ # Determines if the page is an ICO image.
263
+ #
264
+ # @return [Boolean]
265
+ # Specifies whether the page is an ICO image.
266
+ #
267
+ # @since 0.7.0
268
+ #
269
+ def ico?
270
+ is_content_type?('image/x-icon') ||
271
+ is_content_type?('image/vnd.microsoft.icon')
272
+ end
273
+
274
+ alias icon? ico?
224
275
  end
225
276
  end
@@ -105,7 +105,9 @@ module Spidr
105
105
  def each_redirect(&block)
106
106
  return enum_for(__method__) unless block
107
107
 
108
- if (locations = @response.get_fields('Location'))
108
+ locations = @response.get_fields('Location')
109
+
110
+ unless (locations.nil? || locations.empty?)
109
111
  # Location headers override any meta-refresh redirects in the HTML
110
112
  locations.each(&block)
111
113
  else
@@ -175,34 +177,30 @@ module Spidr
175
177
  #
176
178
  # @since 0.3.0
177
179
  #
178
- def each_link
180
+ def each_link(&block)
179
181
  return enum_for(__method__) unless block_given?
180
182
 
181
- filter = lambda { |url|
182
- yield url unless (url.nil? || url.empty?)
183
- }
184
-
185
- each_redirect(&filter) if is_redirect?
183
+ each_redirect(&block) if is_redirect?
186
184
 
187
185
  if (html? && doc)
188
- doc.search('//a[@href]').each do |a|
189
- filter.call(a.get_attribute('href'))
186
+ doc.search('//a[@href[string()]]').each do |a|
187
+ yield a.get_attribute('href')
190
188
  end
191
189
 
192
- doc.search('//frame[@src]').each do |iframe|
193
- filter.call(iframe.get_attribute('src'))
190
+ doc.search('//frame[@src[string()]]').each do |iframe|
191
+ yield iframe.get_attribute('src')
194
192
  end
195
193
 
196
- doc.search('//iframe[@src]').each do |iframe|
197
- filter.call(iframe.get_attribute('src'))
194
+ doc.search('//iframe[@src[string()]]').each do |iframe|
195
+ yield iframe.get_attribute('src')
198
196
  end
199
197
 
200
- doc.search('//link[@href]').each do |link|
201
- filter.call(link.get_attribute('href'))
198
+ doc.search('//link[@href[string()]]').each do |link|
199
+ yield link.get_attribute('href')
202
200
  end
203
201
 
204
- doc.search('//script[@src]').each do |script|
205
- filter.call(script.get_attribute('src'))
202
+ doc.search('//script[@src[string()]]').each do |script|
203
+ yield script.get_attribute('src')
206
204
  end
207
205
  end
208
206
  end
@@ -211,7 +209,7 @@ module Spidr
211
209
  # The links from within the page.
212
210
  #
213
211
  # @return [Array<String>]
214
- # All links within the HTML page, frame/iframe source URLs and any
212
+ # All links within the HTML page, `frame`/`iframe` source URLs and any
215
213
  # links in the `Location` header.
216
214
  #
217
215
  def links
@@ -22,16 +22,6 @@ module Spidr
22
22
 
23
23
  alias ok? is_ok?
24
24
 
25
- #
26
- # Determines if the response code is `308`.
27
- #
28
- # @return [Boolean]
29
- # Specifies whether the response code is `308`.
30
- #
31
- def timedout?
32
- code == 308
33
- end
34
-
35
25
  #
36
26
  # Determines if the response code is `400`.
37
27
  #
@@ -78,6 +68,18 @@ module Spidr
78
68
 
79
69
  alias missing? is_missing?
80
70
 
71
+ #
72
+ # Determines if the response code is `408`.
73
+ #
74
+ # @return [Boolean]
75
+ # Specifies whether the response code is `408`.
76
+ #
77
+ def is_timedout?
78
+ code == 408
79
+ end
80
+
81
+ alias timedout? is_timedout?
82
+
81
83
  #
82
84
  # Determines if the response code is `500`.
83
85
  #
data/lib/spidr/proxy.rb CHANGED
@@ -10,28 +10,20 @@ module Spidr
10
10
  #
11
11
  # Initializes the proxy.
12
12
  #
13
- # @param [Hash] attributes
14
- # Attributes for the proxy.
15
- #
16
- # @option attributes [String] :host
13
+ # @param [String] host
17
14
  # The host the proxy is running on.
18
15
  #
19
- # @option attributes [Integer] :port
16
+ # @param [Integer] port
20
17
  # The port the proxy is running on.
21
18
  #
22
- # @option attributes [String] :user
19
+ # @param [String] user
23
20
  # The user to authenticate as with the proxy.
24
21
  #
25
- # @option attributes [String] :password
22
+ # @param [String] password
26
23
  # The password to authenticate with.
27
24
  #
28
- def initialize(attributes={})
29
- super(
30
- attributes[:host],
31
- attributes.fetch(:port,DEFAULT_PORT),
32
- attributes[:user],
33
- attributes[:password]
34
- )
25
+ def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
26
+ super(host,port,user,password)
35
27
  end
36
28
 
37
29
  #
data/lib/spidr/rules.rb CHANGED
@@ -14,21 +14,18 @@ module Spidr
14
14
  #
15
15
  # Creates a new Rules object.
16
16
  #
17
- # @param [Hash] options
18
- # Additional options.
19
- #
20
- # @option options [Array<String, Regexp, Proc>] :accept
17
+ # @param [Array<String, Regexp, Proc>, nil] accept
21
18
  # The patterns to accept data with.
22
19
  #
23
- # @option options [Array<String, Regexp, Proc>] :reject
20
+ # @param [Array<String, Regexp, Proc>, nil] reject
24
21
  # The patterns to reject data with.
25
22
  #
26
- def initialize(options={})
23
+ def initialize(accept: nil, reject: nil)
27
24
  @accept = []
28
25
  @reject = []
29
26
 
30
- @accept += options[:accept] if options[:accept]
31
- @reject += options[:reject] if options[:reject]
27
+ @accept += accept if accept
28
+ @reject += reject if reject
32
29
  end
33
30
 
34
31
  #