spidr 0.6.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/spidr/agent.rb CHANGED
@@ -19,12 +19,12 @@ module Spidr
19
19
 
20
20
  include Settings::UserAgent
21
21
 
22
- # HTTP Host Header to use
22
+ # HTTP `Host` Header to use
23
23
  #
24
24
  # @return [String]
25
25
  attr_accessor :host_header
26
26
 
27
- # HTTP Host Headers to use for specific hosts
27
+ # HTTP `Host` Headers to use for specific hosts
28
28
  #
29
29
  # @return [Hash{String,Regexp => String}]
30
30
  attr_reader :host_headers
@@ -96,70 +96,110 @@ module Spidr
96
96
  #
97
97
  # Creates a new Agent object.
98
98
  #
99
- # @param [Hash] options
100
- # Additional options
99
+ # @param [String, nil] host_header
100
+ # The HTTP `Host` header to use with each request.
101
101
  #
102
- # @option options [Integer] :open_timeout (Spidr.open_timeout)
103
- # Optional open timeout.
102
+ # @param [Hash{String,Regexp => String}] host_headers
103
+ # The HTTP `Host` headers to use for specific hosts.
104
104
  #
105
- # @option options [Integer] :read_timeout (Spidr.read_timeout)
105
+ # @param [Hash{String => String}] default_headers
106
+ # Default headers to set for every request.
107
+ #
108
+ # @param [String, nil] user_agent
109
+ # The `User-Agent` string to send with each request.
110
+ #
111
+ # @param [String, nil] referer
112
+ # The `Referer` URL to send with each request.
113
+ #
114
+ # @param [Integer, nil] open_timeout
115
+ # Optional open connection timeout.
116
+ #
117
+ # @param [Integer, nil] read_timeout
106
118
  # Optional read timeout.
107
119
  #
108
- # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
109
- # Optional ssl timeout.
120
+ # @param [Integer, nil] ssl_timeout
121
+ # Optional SSL connection timeout.
110
122
  #
111
- # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
123
+ # @param [Integer, nil] continue_timeout
112
124
  # Optional continue timeout.
113
125
  #
114
- # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
115
- # Optional keep_alive timeout.
126
+ # @param [Integer, nil] keep_alive_timeout
127
+ # Optional `Keep-Alive` timeout.
116
128
  #
117
- # @option options [Hash] :proxy (Spidr.proxy)
129
+ # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
118
130
  # The proxy information to use.
119
131
  #
120
- # @option :proxy [String] :host
132
+ # @option proxy [String] :host
121
133
  # The host the proxy is running on.
122
134
  #
123
- # @option :proxy [Integer] :port
135
+ # @option proxy [Integer] :port (8080)
124
136
  # The port the proxy is running on.
125
137
  #
126
- # @option :proxy [String] :user
138
+ # @option proxy [String, nil] :user
127
139
  # The user to authenticate as with the proxy.
128
140
  #
129
- # @option :proxy [String] :password
141
+ # @option proxy [String, nil] :password
130
142
  # The password to authenticate with.
131
143
  #
132
- # @option options [Hash{String => String}] :default_headers
133
- # Default headers to set for every request.
144
+ # @param [Integer] delay
145
+ # The number of seconds to pause between each request.
134
146
  #
135
- # @option options [String] :host_header
136
- # The HTTP Host header to use with each request.
147
+ # @param [Integer, nil] limit
148
+ # The maximum number of pages to visit.
137
149
  #
138
- # @option options [Hash{String,Regexp => String}] :host_headers
139
- # The HTTP Host headers to use for specific hosts.
150
+ # @param [Integer, nil] max_depth
151
+ # The maximum link depth to follow.
140
152
  #
141
- # @option options [String] :user_agent (Spidr.user_agent)
142
- # The User-Agent string to send with each requests.
153
+ # @param [Set, Array, nil] queue
154
+ # The initial queue of URLs to visit.
143
155
  #
144
- # @option options [String] :referer
145
- # The Referer URL to send with each request.
156
+ # @param [Set, Array, nil] history
157
+ # The initial list of visited URLs.
146
158
  #
147
- # @option options [Integer] :delay (0)
148
- # The number of seconds to pause between each request.
159
+ # @param [Boolean] strip_fragments
160
+ # Controls whether to strip the fragment components from the URLs.
149
161
  #
150
- # @option options [Set, Array] :queue
151
- # The initial queue of URLs to visit.
162
+ # @param [Boolean] strip_query
163
+ # Controls whether to strip the query components from the URLs.
152
164
  #
153
- # @option options [Set, Array] :history
154
- # The initial list of visited URLs.
165
+ # @param [Array<String>] schemes
166
+ # The list of acceptable URI schemes to visit.
167
+ # The `https` scheme will be ignored if `net/https` cannot be loaded.
155
168
  #
156
- # @option options [Integer] :limit
157
- # The maximum number of pages to visit.
169
+ # @param [String] host
170
+ # The host-name to visit.
158
171
  #
159
- # @option options [Integer] :max_depth
160
- # The maximum link depth to follow.
172
+ # @param [Array<String, Regexp, Proc>] hosts
173
+ # The patterns which match the host-names to visit.
161
174
  #
162
- # @option options [Boolean] :robots (Spidr.robots?)
175
+ # @param [Array<String, Regexp, Proc>] ignore_hosts
176
+ # The patterns which match the host-names to not visit.
177
+ #
178
+ # @param [Array<Integer, Regexp, Proc>] ports
179
+ # The patterns which match the ports to visit.
180
+ #
181
+ # @param [Array<Integer, Regexp, Proc>] ignore_ports
182
+ # The patterns which match the ports to not visit.
183
+ #
184
+ # @param [Array<String, Regexp, Proc>] links
185
+ # The patterns which match the links to visit.
186
+ #
187
+ # @param [Array<String, Regexp, Proc>] ignore_links
188
+ # The patterns which match the links to not visit.
189
+ #
190
+ # @param [Array<String, Regexp, Proc>] urls
191
+ # The patterns which match the URLs to visit.
192
+ #
193
+ # @param [Array<String, Regexp, Proc>] ignore_urls
194
+ # The patterns which match the URLs to not visit.
195
+ #
196
+ # @param [Array<String, Regexp, Proc>] exts
197
+ # The patterns which match the URI path extensions to visit.
198
+ #
199
+ # @param [Array<String, Regexp, Proc>] ignore_exts
200
+ # The patterns which match the URI path extensions to not visit.
201
+ #
202
+ # @param [Boolean] robots
163
203
  # Specifies whether `robots.txt` should be honored.
164
204
  #
165
205
  # @yield [agent]
@@ -169,58 +209,99 @@ module Spidr
169
209
  # @yieldparam [Agent] agent
170
210
  # The newly created agent.
171
211
  #
172
- # @see #initialize_sanitizers
173
- # @see #initialize_filters
174
- # @see #initialize_actions
175
- # @see #initialize_events
176
- #
177
- def initialize(options={})
178
- @host_header = options[:host_header]
179
- @host_headers = {}
180
-
181
- if options[:host_headers]
182
- @host_headers.merge!(options[:host_headers])
183
- end
184
-
185
- @default_headers = {}
186
-
187
- if options[:default_headers]
188
- @default_headers.merge!(options[:default_headers])
189
- end
190
-
191
- @user_agent = options.fetch(:user_agent,Spidr.user_agent)
192
- @referer = options[:referer]
193
-
194
- @sessions = SessionCache.new(options)
212
+ def initialize(# header keyword arguments
213
+ host_header: nil,
214
+ host_headers: {},
215
+ default_headers: {},
216
+ user_agent: Spidr.user_agent,
217
+ referer: nil,
218
+ # session cache keyword arguments
219
+ proxy: Spidr.proxy,
220
+ open_timeout: Spidr.open_timeout,
221
+ ssl_timeout: Spidr.ssl_timeout,
222
+ read_timeout: Spidr.read_timeout,
223
+ continue_timeout: Spidr.continue_timeout,
224
+ keep_alive_timeout: Spidr.keep_alive_timeout,
225
+ # spidering controls keyword arguments
226
+ delay: 0,
227
+ limit: nil,
228
+ max_depth: nil,
229
+ # history keyword arguments
230
+ queue: nil,
231
+ history: nil,
232
+ # sanitizer keyword arguments
233
+ strip_fragments: true,
234
+ strip_query: false,
235
+ # filtering keyword arguments
236
+ schemes: self.class.default_schemes,
237
+ host: nil,
238
+ hosts: nil,
239
+ ignore_hosts: nil,
240
+ ports: nil,
241
+ ignore_ports: nil,
242
+ links: nil,
243
+ ignore_links: nil,
244
+ urls: nil,
245
+ ignore_urls: nil,
246
+ exts: nil,
247
+ ignore_exts: nil,
248
+ # robots keyword arguments
249
+ robots: Spidr.robots?)
250
+ @host_header = host_header
251
+ @host_headers = host_headers
252
+
253
+ @default_headers = default_headers
254
+
255
+ @user_agent = user_agent
256
+ @referer = referer
257
+
258
+ @sessions = SessionCache.new(
259
+ proxy: proxy,
260
+ open_timeout: open_timeout,
261
+ ssl_timeout: ssl_timeout,
262
+ read_timeout: read_timeout,
263
+ continue_timeout: continue_timeout,
264
+ keep_alive_timeout: keep_alive_timeout
265
+ )
195
266
  @cookies = CookieJar.new
196
267
  @authorized = AuthStore.new
197
268
 
198
269
  @running = false
199
- @delay = options.fetch(:delay,0)
270
+ @delay = delay
200
271
  @history = Set[]
201
272
  @failures = Set[]
202
273
  @queue = []
203
274
 
204
- @limit = options[:limit]
275
+ @limit = limit
205
276
  @levels = Hash.new(0)
206
- @max_depth = options[:max_depth]
207
-
208
- if options[:queue]
209
- self.queue = options[:queue]
210
- end
211
-
212
- if options[:history]
213
- self.history = options[:history]
214
- end
215
-
216
- initialize_sanitizers(options)
217
- initialize_filters(options)
218
- initialize_actions(options)
219
- initialize_events(options)
220
-
221
- if options.fetch(:robots,Spidr.robots?)
222
- initialize_robots
223
- end
277
+ @max_depth = max_depth
278
+
279
+ self.queue = queue if queue
280
+ self.history = history if history
281
+
282
+ initialize_sanitizers(
283
+ strip_fragments: strip_fragments,
284
+ strip_query: strip_query
285
+ )
286
+
287
+ initialize_filters(
288
+ schemes: schemes,
289
+ host: host,
290
+ hosts: hosts,
291
+ ignore_hosts: ignore_hosts,
292
+ ports: ports,
293
+ ignore_ports: ignore_ports,
294
+ links: links,
295
+ ignore_links: ignore_links,
296
+ urls: urls,
297
+ ignore_urls: ignore_urls,
298
+ exts: exts,
299
+ ignore_exts: ignore_exts
300
+ )
301
+ initialize_actions
302
+ initialize_events
303
+
304
+ initialize_robots if robots
224
305
 
225
306
  yield self if block_given?
226
307
  end
@@ -231,8 +312,8 @@ module Spidr
231
312
  # @param [URI::HTTP, String] url
232
313
  # The URL to start spidering at.
233
314
  #
234
- # @param [Hash] options
235
- # Additional options. See {Agent#initialize}.
315
+ # @param [Hash{Symbol => Object}] kwargs
316
+ # Additional keyword arguments. See {Agent#initialize}.
236
317
  #
237
318
  # @yield [agent]
238
319
  # If a block is given, it will be passed the newly created agent
@@ -241,12 +322,16 @@ module Spidr
241
322
  # @yieldparam [Agent] agent
242
323
  # The newly created agent.
243
324
  #
325
+ # @return [Agent]
326
+ # The created agent object.
327
+ #
244
328
  # @see #initialize
245
329
  # @see #start_at
246
330
  #
247
- def self.start_at(url,options={},&block)
248
- agent = new(options,&block)
331
+ def self.start_at(url,**kwargs,&block)
332
+ agent = new(**kwargs,&block)
249
333
  agent.start_at(url)
334
+ return agent
250
335
  end
251
336
 
252
337
  #
@@ -255,8 +340,8 @@ module Spidr
255
340
  # @param [URI::HTTP, String] url
256
341
  # The web-site to spider.
257
342
  #
258
- # @param [Hash] options
259
- # Additional options. See {Agent#initialize}.
343
+ # @param [Hash{Symbol => Object}] kwargs
344
+ # Additional keyword arguments. See {Agent#initialize}.
260
345
  #
261
346
  # @yield [agent]
262
347
  # If a block is given, it will be passed the newly created agent
@@ -265,13 +350,17 @@ module Spidr
265
350
  # @yieldparam [Agent] agent
266
351
  # The newly created agent.
267
352
  #
353
+ # @return [Agent]
354
+ # The created agent object.
355
+ #
268
356
  # @see #initialize
269
357
  #
270
- def self.site(url,options={},&block)
358
+ def self.site(url,**kwargs,&block)
271
359
  url = URI(url)
272
360
 
273
- agent = new(options.merge(host: url.host),&block)
361
+ agent = new(host: url.host, **kwargs, &block)
274
362
  agent.start_at(url)
363
+ return agent
275
364
  end
276
365
 
277
366
  #
@@ -280,8 +369,35 @@ module Spidr
280
369
  # @param [String] name
281
370
  # The host-name to spider.
282
371
  #
283
- # @param [Hash] options
284
- # Additional options. See {Agent#initialize}.
372
+ # @param [Hash{Symbol => Object}] kwargs
373
+ # Additional keyword arguments. See {Agent#initialize}.
374
+ #
375
+ # @yield [agent]
376
+ # If a block is given, it will be passed the newly created agent
377
+ # before it begins spidering.
378
+ #
379
+ # @yieldparam [Agent] agent
380
+ # The newly created agent.
381
+ #
382
+ # @return [Agent]
383
+ # The created agent object.
384
+ #
385
+ # @see #initialize
386
+ #
387
+ def self.host(name,**kwargs,&block)
388
+ agent = new(host: name, **kwargs, &block)
389
+ agent.start_at(URI::HTTP.build(host: name, path: '/'))
390
+ return agent
391
+ end
392
+
393
+ #
394
+ # Creates a new agent and spiders the entire domain.
395
+ #
396
+ # @param [String] name
397
+ # The top-level domain to spider.
398
+ #
399
+ # @param [Hash{Symbol => Object}] kwargs
400
+ # Additional keyword arguments. See {Agent#initialize}.
285
401
  #
286
402
  # @yield [agent]
287
403
  # If a block is given, it will be passed the newly created agent
@@ -290,11 +406,17 @@ module Spidr
290
406
  # @yieldparam [Agent] agent
291
407
  # The newly created agent.
292
408
  #
409
+ # @return [Agent]
410
+ # The created agent object.
411
+ #
293
412
  # @see #initialize
294
413
  #
295
- def self.host(name,options={},&block)
296
- agent = new(options.merge(host: name),&block)
414
+ # @since 0.7.0
415
+ #
416
+ def self.domain(name,**kwargs,&block)
417
+ agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
297
418
  agent.start_at(URI::HTTP.build(host: name, path: '/'))
419
+ return agent
298
420
  end
299
421
 
300
422
  #
@@ -314,10 +436,10 @@ module Spidr
314
436
  #
315
437
  # Sets the proxy information that the agent uses.
316
438
  #
317
- # @param [Proxy] new_proxy
439
+ # @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
318
440
  # The new proxy information.
319
441
  #
320
- # @return [Hash]
442
+ # @return [Proxy]
321
443
  # The new proxy information.
322
444
  #
323
445
  # @see SessionCache#proxy=
@@ -534,7 +656,7 @@ module Spidr
534
656
  def enqueue(url,level=0)
535
657
  url = sanitize_url(url)
536
658
 
537
- if (!(queued?(url)) && visit?(url))
659
+ if (!queued?(url) && visit?(url))
538
660
  link = url.to_s
539
661
 
540
662
  begin
@@ -633,7 +755,7 @@ module Spidr
633
755
  end
634
756
 
635
757
  #
636
- # Visits a given URL, and enqueus the links recovered from the URL
758
+ # Visits a given URL, and enqueues the links recovered from the URL
637
759
  # to be visited later.
638
760
  #
639
761
  # @param [URI::HTTP, String] url
@@ -109,7 +109,7 @@ module Spidr
109
109
  # or `nil` if no authorization exists.
110
110
  #
111
111
  # @param [URI] url
112
- # The url.
112
+ # The URL.
113
113
  #
114
114
  # @return [String, nil]
115
115
  # The base64 encoded authorization string or `nil`.
@@ -221,5 +221,56 @@ module Spidr
221
221
  def zip?
222
222
  is_content_type?('application/zip')
223
223
  end
224
+
225
+ #
226
+ # Determines if the page is a PNG image.
227
+ #
228
+ # @return [Boolean]
229
+ # Specifies whether the page is a PNG image.
230
+ #
231
+ # @since 0.7.0
232
+ #
233
+ def png?
234
+ is_content_type?('image/png')
235
+ end
236
+
237
+ #
238
+ # Determines if the page is a GIF image.
239
+ #
240
+ # @return [Boolean]
241
+ # Specifies whether the page is a GIF image.
242
+ #
243
+ # @since 0.7.0
244
+ #
245
+ def gif?
246
+ is_content_type?('image/gif')
247
+ end
248
+
249
+ #
250
+ # Determines if the page is a JPEG image.
251
+ #
252
+ # @return [Boolean]
253
+ # Specifies whether the page is a JPEG image.
254
+ #
255
+ # @since 0.7.0
256
+ #
257
+ def jpeg?
258
+ is_content_type?('image/jpeg')
259
+ end
260
+
261
+ #
262
+ # Determines if the page is an ICO image.
263
+ #
264
+ # @return [Boolean]
265
+ # Specifies whether the page is an ICO image.
266
+ #
267
+ # @since 0.7.0
268
+ #
269
+ def ico?
270
+ is_content_type?('image/x-icon') ||
271
+ is_content_type?('image/vnd.microsoft.icon')
272
+ end
273
+
274
+ alias icon? ico?
224
275
  end
225
276
  end
@@ -105,7 +105,9 @@ module Spidr
105
105
  def each_redirect(&block)
106
106
  return enum_for(__method__) unless block
107
107
 
108
- if (locations = @response.get_fields('Location'))
108
+ locations = @response.get_fields('Location')
109
+
110
+ unless (locations.nil? || locations.empty?)
109
111
  # Location headers override any meta-refresh redirects in the HTML
110
112
  locations.each(&block)
111
113
  else
@@ -175,34 +177,30 @@ module Spidr
175
177
  #
176
178
  # @since 0.3.0
177
179
  #
178
- def each_link
180
+ def each_link(&block)
179
181
  return enum_for(__method__) unless block_given?
180
182
 
181
- filter = lambda { |url|
182
- yield url unless (url.nil? || url.empty?)
183
- }
184
-
185
- each_redirect(&filter) if is_redirect?
183
+ each_redirect(&block) if is_redirect?
186
184
 
187
185
  if (html? && doc)
188
- doc.search('//a[@href]').each do |a|
189
- filter.call(a.get_attribute('href'))
186
+ doc.search('//a[@href[string()]]').each do |a|
187
+ yield a.get_attribute('href')
190
188
  end
191
189
 
192
- doc.search('//frame[@src]').each do |iframe|
193
- filter.call(iframe.get_attribute('src'))
190
+ doc.search('//frame[@src[string()]]').each do |iframe|
191
+ yield iframe.get_attribute('src')
194
192
  end
195
193
 
196
- doc.search('//iframe[@src]').each do |iframe|
197
- filter.call(iframe.get_attribute('src'))
194
+ doc.search('//iframe[@src[string()]]').each do |iframe|
195
+ yield iframe.get_attribute('src')
198
196
  end
199
197
 
200
- doc.search('//link[@href]').each do |link|
201
- filter.call(link.get_attribute('href'))
198
+ doc.search('//link[@href[string()]]').each do |link|
199
+ yield link.get_attribute('href')
202
200
  end
203
201
 
204
- doc.search('//script[@src]').each do |script|
205
- filter.call(script.get_attribute('src'))
202
+ doc.search('//script[@src[string()]]').each do |script|
203
+ yield script.get_attribute('src')
206
204
  end
207
205
  end
208
206
  end
@@ -211,7 +209,7 @@ module Spidr
211
209
  # The links from within the page.
212
210
  #
213
211
  # @return [Array<String>]
214
- # All links within the HTML page, frame/iframe source URLs and any
212
+ # All links within the HTML page, `frame`/`iframe` source URLs and any
215
213
  # links in the `Location` header.
216
214
  #
217
215
  def links
@@ -22,16 +22,6 @@ module Spidr
22
22
 
23
23
  alias ok? is_ok?
24
24
 
25
- #
26
- # Determines if the response code is `308`.
27
- #
28
- # @return [Boolean]
29
- # Specifies whether the response code is `308`.
30
- #
31
- def timedout?
32
- code == 308
33
- end
34
-
35
25
  #
36
26
  # Determines if the response code is `400`.
37
27
  #
@@ -78,6 +68,18 @@ module Spidr
78
68
 
79
69
  alias missing? is_missing?
80
70
 
71
+ #
72
+ # Determines if the response code is `408`.
73
+ #
74
+ # @return [Boolean]
75
+ # Specifies whether the response code is `408`.
76
+ #
77
+ def is_timedout?
78
+ code == 408
79
+ end
80
+
81
+ alias timedout? is_timedout?
82
+
81
83
  #
82
84
  # Determines if the response code is `500`.
83
85
  #
data/lib/spidr/proxy.rb CHANGED
@@ -10,28 +10,20 @@ module Spidr
10
10
  #
11
11
  # Initializes the proxy.
12
12
  #
13
- # @param [Hash] attributes
14
- # Attributes for the proxy.
15
- #
16
- # @option attributes [String] :host
13
+ # @param [String] host
17
14
  # The host the proxy is running on.
18
15
  #
19
- # @option attributes [Integer] :port
16
+ # @param [Integer] port
20
17
  # The port the proxy is running on.
21
18
  #
22
- # @option attributes [String] :user
19
+ # @param [String] user
23
20
  # The user to authenticate as with the proxy.
24
21
  #
25
- # @option attributes [String] :password
22
+ # @param [String] password
26
23
  # The password to authenticate with.
27
24
  #
28
- def initialize(attributes={})
29
- super(
30
- attributes[:host],
31
- attributes.fetch(:port,DEFAULT_PORT),
32
- attributes[:user],
33
- attributes[:password]
34
- )
25
+ def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
26
+ super(host,port,user,password)
35
27
  end
36
28
 
37
29
  #
data/lib/spidr/rules.rb CHANGED
@@ -14,21 +14,18 @@ module Spidr
14
14
  #
15
15
  # Creates a new Rules object.
16
16
  #
17
- # @param [Hash] options
18
- # Additional options.
19
- #
20
- # @option options [Array<String, Regexp, Proc>] :accept
17
+ # @param [Array<String, Regexp, Proc>, nil] accept
21
18
  # The patterns to accept data with.
22
19
  #
23
- # @option options [Array<String, Regexp, Proc>] :reject
20
+ # @param [Array<String, Regexp, Proc>, nil] reject
24
21
  # The patterns to reject data with.
25
22
  #
26
- def initialize(options={})
23
+ def initialize(accept: nil, reject: nil)
27
24
  @accept = []
28
25
  @reject = []
29
26
 
30
- @accept += options[:accept] if options[:accept]
31
- @reject += options[:reject] if options[:reject]
27
+ @accept += accept if accept
28
+ @reject += reject if reject
32
29
  end
33
30
 
34
31
  #