spidr_epg 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +10 -0
- data/.rspec +1 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +291 -0
- data/ChangeLog.md~ +291 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +49 -0
- data/Gemfile~ +16 -0
- data/LICENSE.txt +20 -0
- data/README.md +193 -0
- data/README.md~ +190 -0
- data/Rakefile +29 -0
- data/gemspec.yml +19 -0
- data/lib/spidr/actions/actions.rb +83 -0
- data/lib/spidr/actions/exceptions/action.rb +9 -0
- data/lib/spidr/actions/exceptions/paused.rb +11 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/agent.rb +866 -0
- data/lib/spidr/auth_credential.rb +28 -0
- data/lib/spidr/auth_store.rb +161 -0
- data/lib/spidr/body.rb +98 -0
- data/lib/spidr/cookie_jar.rb +202 -0
- data/lib/spidr/events.rb +537 -0
- data/lib/spidr/extensions/uri.rb +52 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/filters.rb +539 -0
- data/lib/spidr/headers.rb +370 -0
- data/lib/spidr/links.rb +229 -0
- data/lib/spidr/page.rb +108 -0
- data/lib/spidr/rules.rb +79 -0
- data/lib/spidr/sanitizers.rb +56 -0
- data/lib/spidr/session_cache.rb +145 -0
- data/lib/spidr/spidr.rb +107 -0
- data/lib/spidr/version.rb +4 -0
- data/lib/spidr/version.rb~ +4 -0
- data/lib/spidr.rb +3 -0
- data/pkg/spidr-1.0.0.gem +0 -0
- data/spec/actions_spec.rb +59 -0
- data/spec/agent_spec.rb +81 -0
- data/spec/auth_store_spec.rb +85 -0
- data/spec/cookie_jar_spec.rb +144 -0
- data/spec/extensions/uri_spec.rb +43 -0
- data/spec/filters_spec.rb +61 -0
- data/spec/helpers/history.rb +34 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/helpers/wsoc.rb +83 -0
- data/spec/page_examples.rb +21 -0
- data/spec/page_spec.rb +125 -0
- data/spec/rules_spec.rb +45 -0
- data/spec/sanitizers_spec.rb +61 -0
- data/spec/session_cache.rb +58 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/spidr_spec.rb +39 -0
- data/spidr.gemspec +133 -0
- data/spidr.gemspec~ +131 -0
- metadata +158 -0
@@ -0,0 +1,539 @@
|
|
1
|
+
require 'spidrs/rules'
|
2
|
+
|
3
|
+
module Spidr
  #
  # The {Filters} module adds methods to {Agent} for controlling which
  # URLs the agent will visit.
  #
  # Each kind of filter (hosts, ports, links, URLs, path extensions) is
  # backed by a `Rules` object that exposes an `accept` list, a `reject`
  # list and an `accept?` predicate. The rule objects are created by
  # {#initialize_filters}, which must be called before any other method
  # of this module is used.
  #
  module Filters
    # List of acceptable URL schemes to follow
    attr_reader :schemes

    #
    # Sets the list of acceptable URL schemes to visit.
    #
    # Every entry is converted with `to_s`, so Symbols and Strings may be
    # mixed freely.
    #
    # @param [Array] new_schemes
    #   The new schemes to visit.
    #
    # @example
    #   agent.schemes = ['http']
    #
    def schemes=(new_schemes)
      @schemes = new_schemes.map { |scheme| scheme.to_s }
    end

    #
    # Specifies the patterns that match host-names to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to visit.
    #
    def visit_hosts
      @host_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_hosts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] host
    #   A host-name to accept or reject.
    #
    # @return [self]
    #
    def visit_hosts_like(pattern=nil,&block)
      matcher = pattern || block
      visit_hosts << matcher if matcher

      self
    end

    #
    # Specifies the patterns that match host-names to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to not visit.
    #
    def ignore_hosts
      @host_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_hosts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] host
    #   A host-name to reject or accept.
    #
    # @return [self]
    #
    def ignore_hosts_like(pattern=nil,&block)
      matcher = pattern || block
      ignore_hosts << matcher if matcher

      self
    end

    #
    # Specifies the patterns that match the ports to visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to visit.
    #
    def visit_ports
      @port_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_ports}.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [Integer] port
    #   A port to accept or reject.
    #
    # @return [self]
    #
    def visit_ports_like(pattern=nil,&block)
      matcher = pattern || block
      visit_ports << matcher if matcher

      self
    end

    #
    # Specifies the patterns that match ports to not visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to not visit.
    #
    def ignore_ports
      @port_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_ports}.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [Integer] port
    #   A port to reject or accept.
    #
    # @return [self]
    #
    def ignore_ports_like(pattern=nil,&block)
      matcher = pattern || block
      ignore_ports << matcher if matcher

      self
    end

    #
    # Specifies the patterns that match the links to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to visit.
    #
    # @since 0.2.4
    #
    def visit_links
      @link_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_links}
    #
    # @param [String, Regexp] pattern
    #   The pattern to match link with.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] link
    #   A link to accept or reject.
    #
    # @return [self]
    #
    # @since 0.2.4
    #
    def visit_links_like(pattern=nil,&block)
      matcher = pattern || block
      visit_links << matcher if matcher

      self
    end

    #
    # Specifies the patterns that match links to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to not visit.
    #
    def ignore_links
      @link_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_links}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] link
    #   A link to reject or accept.
    #
    # @return [self]
    #
    def ignore_links_like(pattern=nil,&block)
      matcher = pattern || block
      ignore_links << matcher if matcher

      self
    end

    #
    # Specifies the patterns that match the URLs to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URL patterns to visit.
    #
    # @since 0.2.4
    #
    def visit_urls
      @url_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_urls}
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URLs with.
    #
    # @yield [url]
    #   If a block is given, it will be used to filter URLs.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [URI::HTTP, URI::HTTPS] url
    #   A URL to accept or reject.
    #
    # @return [self]
    #
    # @since 0.2.4
    #
    def visit_urls_like(pattern=nil,&block)
      matcher = pattern || block
      visit_urls << matcher if matcher

      self
    end

    #
    # Specifies the patterns that match URLs to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URL patterns to not visit.
    #
    # @since 0.2.4
    #
    def ignore_urls
      @url_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_urls}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URLs with.
    #
    # @yield [url]
    #   If a block is given, it will be used to filter URLs.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [URI::HTTP, URI::HTTPS] url
    #   A URL to reject or accept.
    #
    # @return [self]
    #
    # @since 0.2.4
    #
    def ignore_urls_like(pattern=nil,&block)
      matcher = pattern || block
      ignore_urls << matcher if matcher

      self
    end

    #
    # Specifies the patterns that match the URI path extensions to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extensions patterns to visit.
    #
    def visit_exts
      @ext_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_exts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] ext
    #   A URI path extension to accept or reject.
    #
    # @return [self]
    #
    def visit_exts_like(pattern=nil,&block)
      matcher = pattern || block
      visit_exts << matcher if matcher

      self
    end

    #
    # Specifies the patterns that match URI path extensions to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extension patterns to not visit.
    #
    def ignore_exts
      @ext_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_exts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] ext
    #   A URI path extension to reject or accept.
    #
    # @return [self]
    #
    def ignore_exts_like(pattern=nil,&block)
      matcher = pattern || block
      ignore_exts << matcher if matcher

      self
    end

    protected

    #
    # Initializes filtering rules.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array] :schemes (['http', 'https'])
    #   The list of acceptable URI schemes to visit. Entries are converted
    #   to Strings, the same way {#schemes=} does.
    #   The `https` scheme will be ignored if `net/https` cannot be loaded.
    #
    # @option options [String] :host
    #   The host-name to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :hosts
    #   The patterns which match the host-names to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
    #   The patterns which match the host-names to not visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ports
    #   The patterns which match the ports to visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
    #   The patterns which match the ports to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :links
    #   The patterns which match the links to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_links
    #   The patterns which match the links to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :urls
    #   The patterns which match the URLs to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_urls
    #   The patterns which match the URLs to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :exts
    #   The patterns which match the URI path extensions to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_exts
    #   The patterns which match the URI path extensions to not visit.
    #
    # @option options [Object] :queue
    #   When given, handed to `queue=` of the including object.
    #   NOTE(review): the setter is defined outside this module; the
    #   expected type is not visible here.
    #
    # @option options [Object] :history
    #   When given, handed to `history=` of the including object.
    #   NOTE(review): the setter is defined outside this module; the
    #   expected type is not visible here.
    #
    def initialize_filters(options={})
      if options[:schemes]
        # go through the writer so that Symbols are normalized to Strings;
        # visit_scheme? compares against String entries.
        self.schemes = options[:schemes]
      else
        @schemes = ['http']

        begin
          require 'net/https'

          @schemes << 'https'
        rescue Gem::LoadError
          # Gem::LoadError is a LoadError too; re-raise it so that gem
          # activation problems are not mistaken for missing https support.
          raise
        rescue ::LoadError
          warn "Warning: cannot load 'net/https', https support disabled"
        end
      end

      @host_rules = Rules.new(
        :accept => options[:hosts],
        :reject => options[:ignore_hosts]
      )
      @port_rules = Rules.new(
        :accept => options[:ports],
        :reject => options[:ignore_ports]
      )
      @link_rules = Rules.new(
        :accept => options[:links],
        :reject => options[:ignore_links]
      )
      @url_rules = Rules.new(
        :accept => options[:urls],
        :reject => options[:ignore_urls]
      )
      @ext_rules = Rules.new(
        :accept => options[:exts],
        :reject => options[:ignore_exts]
      )

      visit_hosts_like(options[:host]) if options[:host]

      self.queue = options[:queue] if options[:queue]
      self.history = options[:history] if options[:history]
    end

    #
    # Determines if a given URI scheme should be visited.
    #
    # @param [String, Symbol, nil] scheme
    #   The URI scheme. It is compared by its String form.
    #
    # @return [Boolean]
    #   Specifies whether the given scheme should be visited.
    #   A missing (`nil`) scheme is always accepted.
    #
    def visit_scheme?(scheme)
      return true unless scheme

      @schemes.include?(scheme.to_s)
    end

    #
    # Determines if a given host-name should be visited.
    #
    # @param [String] host
    #   The host-name.
    #
    # @return [Boolean]
    #   Specifies whether the given host-name should be visited.
    #
    def visit_host?(host)
      @host_rules.accept?(host)
    end

    #
    # Determines if a given port should be visited.
    #
    # @param [Integer] port
    #   The port number.
    #
    # @return [Boolean]
    #   Specifies whether the given port should be visited.
    #
    def visit_port?(port)
      @port_rules.accept?(port)
    end

    #
    # Determines if a given link should be visited.
    #
    # @param [String] link
    #   The link.
    #
    # @return [Boolean]
    #   Specifies whether the given link should be visited.
    #
    def visit_link?(link)
      @link_rules.accept?(link)
    end

    #
    # Determines if a given URL should be visited.
    #
    # @param [URI::HTTP, URI::HTTPS] link
    #   The URL.
    #
    # @return [Boolean]
    #   Specifies whether the given URL should be visited.
    #
    # @since 0.2.4
    #
    def visit_url?(link)
      @url_rules.accept?(link)
    end

    #
    # Determines if a given URI path extension should be visited.
    #
    # @param [String] path
    #   The path that contains the extension.
    #
    # @return [Boolean]
    #   Specifies whether the given URI path extension should be visited.
    #
    def visit_ext?(path)
      # File.extname yields ".ext"; drop the leading dot. For a path with
      # no extension it yields "", and ""[1..-1] is nil, so the rules are
      # asked about nil in that case.
      @ext_rules.accept?(File.extname(path)[1..-1])
    end
  end
end
|