spidr 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog.md +9 -0
- data/Gemfile +1 -1
- data/lib/spidr/actions/actions.rb +6 -6
- data/lib/spidr/agent.rb +33 -43
- data/lib/spidr/body.rb +8 -9
- data/lib/spidr/events.rb +22 -13
- data/lib/spidr/filters.rb +98 -104
- data/lib/spidr/headers.rb +3 -3
- data/lib/spidr/links.rb +1 -1
- data/lib/spidr/rules.rb +6 -14
- data/lib/spidr/sanitizers.rb +25 -32
- data/lib/spidr/version.rb +1 -1
- data/spidr.gemspec +125 -13
- metadata +3 -3
data/ChangeLog.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
### 0.3.2 / 2011-06-20
|
2
|
+
|
3
|
+
* Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
|
4
|
+
{Spidr::Filters} and {Spidr::Sanitizers}.
|
5
|
+
* Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
|
6
|
+
* Reduce usage of `self.included` and `module_eval`.
|
7
|
+
* Reduce usage of nested-blocks.
|
8
|
+
* Reduce usage of `return`.
|
9
|
+
|
1
10
|
### 0.3.1 / 2011-04-22
|
2
11
|
|
3
12
|
* Require `set` in `spidr/headers.rb`.
|
data/Gemfile
CHANGED
data/lib/spidr/actions/actions.rb
CHANGED
@@ -8,12 +8,6 @@ module Spidr
|
|
8
8
|
# spidering of links.
|
9
9
|
#
|
10
10
|
module Actions
|
11
|
-
def initialize(options={})
|
12
|
-
@paused = false
|
13
|
-
|
14
|
-
super(options)
|
15
|
-
end
|
16
|
-
|
17
11
|
#
|
18
12
|
# Continue spidering.
|
19
13
|
#
|
@@ -79,5 +73,11 @@ module Spidr
|
|
79
73
|
def skip_page!
|
80
74
|
raise(SkipPage)
|
81
75
|
end
|
76
|
+
|
77
|
+
protected
|
78
|
+
|
79
|
+
def initialize_actions(options={})
|
80
|
+
@paused = false
|
81
|
+
end
|
82
82
|
end
|
83
83
|
end
|
data/lib/spidr/agent.rb
CHANGED
@@ -115,15 +115,15 @@ module Spidr
|
|
115
115
|
@host_headers.merge!(options[:host_headers])
|
116
116
|
end
|
117
117
|
|
118
|
-
@user_agent = (
|
118
|
+
@user_agent = options.fetch(:user_agent,Spidr.user_agent)
|
119
119
|
@referer = options[:referer]
|
120
120
|
|
121
|
-
@sessions = SessionCache.new(options
|
121
|
+
@sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
|
122
122
|
@cookies = CookieJar.new
|
123
123
|
@authorized = AuthStore.new
|
124
124
|
|
125
125
|
@running = false
|
126
|
-
@delay = (
|
126
|
+
@delay = options.fetch(:delay,0)
|
127
127
|
@history = Set[]
|
128
128
|
@failures = Set[]
|
129
129
|
@queue = []
|
@@ -131,7 +131,10 @@ module Spidr
|
|
131
131
|
@levels = Hash.new(0)
|
132
132
|
@max_depth = options[:max_depth]
|
133
133
|
|
134
|
-
|
134
|
+
initialize_sanitizers(options)
|
135
|
+
initialize_filters(options)
|
136
|
+
initialize_actions(options)
|
137
|
+
initialize_events(options)
|
135
138
|
|
136
139
|
yield self if block_given?
|
137
140
|
end
|
@@ -152,19 +155,16 @@ module Spidr
|
|
152
155
|
# @yieldparam [Agent] agent
|
153
156
|
# The newly created agent.
|
154
157
|
#
|
155
|
-
def self.start_at(url,options={})
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
spider.start_at(url)
|
160
|
-
end
|
158
|
+
def self.start_at(url,options={},&block)
|
159
|
+
agent = new(options,&block)
|
160
|
+
agent.start_at(url)
|
161
161
|
end
|
162
162
|
|
163
163
|
#
|
164
|
-
# Creates a new agent and spiders the given
|
164
|
+
# Creates a new agent and spiders the web-site located at the given URL.
|
165
165
|
#
|
166
|
-
# @param [String]
|
167
|
-
# The
|
166
|
+
# @param [URI::HTTP, String] url
|
167
|
+
# The web-site to spider.
|
168
168
|
#
|
169
169
|
# @param [Hash] options
|
170
170
|
# Additional options. See {Agent#initialize}.
|
@@ -176,19 +176,18 @@ module Spidr
|
|
176
176
|
# @yieldparam [Agent] agent
|
177
177
|
# The newly created agent.
|
178
178
|
#
|
179
|
-
def self.
|
180
|
-
|
181
|
-
yield spider if block_given?
|
179
|
+
def self.site(url,options={},&block)
|
180
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
182
181
|
|
183
|
-
|
184
|
-
|
182
|
+
agent = new(options.merge(:host => url.host),&block)
|
183
|
+
agent.start_at(url)
|
185
184
|
end
|
186
185
|
|
187
186
|
#
|
188
|
-
# Creates a new agent and spiders the
|
187
|
+
# Creates a new agent and spiders the given host.
|
189
188
|
#
|
190
|
-
# @param [
|
191
|
-
# The
|
189
|
+
# @param [String] name
|
190
|
+
# The host-name to spider.
|
192
191
|
#
|
193
192
|
# @param [Hash] options
|
194
193
|
# Additional options. See {Agent#initialize}.
|
@@ -200,14 +199,8 @@ module Spidr
|
|
200
199
|
# @yieldparam [Agent] agent
|
201
200
|
# The newly created agent.
|
202
201
|
#
|
203
|
-
def self.
|
204
|
-
|
205
|
-
|
206
|
-
return self.new(options.merge(:host => url.host)) do |spider|
|
207
|
-
yield spider if block_given?
|
208
|
-
|
209
|
-
spider.start_at(url)
|
210
|
-
end
|
202
|
+
def self.host(name,options={},&block)
|
203
|
+
site(URI::HTTP.build(:host => name, :path => '/'),options,&block)
|
211
204
|
end
|
212
205
|
|
213
206
|
#
|
@@ -234,7 +227,6 @@ module Spidr
|
|
234
227
|
#
|
235
228
|
def start_at(url,&block)
|
236
229
|
enqueue(url)
|
237
|
-
|
238
230
|
return run(&block)
|
239
231
|
end
|
240
232
|
|
@@ -261,7 +253,6 @@ module Spidr
|
|
261
253
|
end
|
262
254
|
|
263
255
|
@running = false
|
264
|
-
|
265
256
|
@sessions.clear
|
266
257
|
return self
|
267
258
|
end
|
@@ -387,10 +378,10 @@ module Spidr
|
|
387
378
|
|
388
379
|
new_failures.each do |url|
|
389
380
|
@failures << unless url.kind_of?(URI)
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
381
|
+
URI(url.to_s)
|
382
|
+
else
|
383
|
+
url
|
384
|
+
end
|
394
385
|
end
|
395
386
|
|
396
387
|
return @failures
|
@@ -471,7 +462,7 @@ module Spidr
|
|
471
462
|
begin
|
472
463
|
@every_url_blocks.each { |url_block| url_block.call(url) }
|
473
464
|
|
474
|
-
@
|
465
|
+
@every_url_like_blocks.each do |pattern,url_blocks|
|
475
466
|
match = case pattern
|
476
467
|
when Regexp
|
477
468
|
link =~ pattern
|
@@ -653,12 +644,11 @@ module Spidr
|
|
653
644
|
def prepare_request(url,&block)
|
654
645
|
host = url.host
|
655
646
|
port = url.port
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
end
|
647
|
+
path = unless url.path.empty?
|
648
|
+
url.path
|
649
|
+
else
|
650
|
+
'/'
|
651
|
+
end
|
662
652
|
|
663
653
|
# append the URL query to the path
|
664
654
|
path += "?#{url.query}" if url.query
|
@@ -724,7 +714,7 @@ module Spidr
|
|
724
714
|
# Specifies whether the given URL should be visited.
|
725
715
|
#
|
726
716
|
def visit?(url)
|
727
|
-
!
|
717
|
+
!visited?(url) &&
|
728
718
|
visit_scheme?(url.scheme) &&
|
729
719
|
visit_host?(url.host) &&
|
730
720
|
visit_port?(url.port) &&
|
data/lib/spidr/body.rb
CHANGED
@@ -24,16 +24,15 @@ module Spidr
|
|
24
24
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
25
25
|
#
|
26
26
|
def doc
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
27
|
+
unless body.empty?
|
28
|
+
begin
|
29
|
+
if html?
|
30
|
+
@doc ||= Nokogiri::HTML(body)
|
31
|
+
elsif (rss? || atom? || xml? || xsl?)
|
32
|
+
@doc ||= Nokogiri::XML(body)
|
33
|
+
end
|
34
|
+
rescue
|
34
35
|
end
|
35
|
-
rescue
|
36
|
-
return nil
|
37
36
|
end
|
38
37
|
end
|
39
38
|
|
data/lib/spidr/events.rb
CHANGED
@@ -5,17 +5,6 @@ module Spidr
|
|
5
5
|
# they are visited.
|
6
6
|
#
|
7
7
|
module Events
|
8
|
-
def initialize(options={})
|
9
|
-
super(options)
|
10
|
-
|
11
|
-
@every_url_blocks = []
|
12
|
-
@every_failed_url_blocks = []
|
13
|
-
@urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
14
|
-
|
15
|
-
@every_page_blocks = []
|
16
|
-
@every_link_blocks = []
|
17
|
-
end
|
18
|
-
|
19
8
|
#
|
20
9
|
# Pass each URL from each page visited to the given block.
|
21
10
|
#
|
@@ -57,11 +46,20 @@ module Spidr
|
|
57
46
|
# @yieldparam [URI::HTTP] url
|
58
47
|
# A matching URL.
|
59
48
|
#
|
60
|
-
|
61
|
-
|
49
|
+
# @since 0.3.2
|
50
|
+
#
|
51
|
+
def every_url_like(pattern,&block)
|
52
|
+
@every_url_like_blocks[pattern] << block
|
62
53
|
return self
|
63
54
|
end
|
64
55
|
|
56
|
+
#
|
57
|
+
# @see #every_url_like
|
58
|
+
#
|
59
|
+
def urls_like(pattern,&block)
|
60
|
+
every_url_like(pattern,&block)
|
61
|
+
end
|
62
|
+
|
65
63
|
#
|
66
64
|
# Pass the headers from every response the agent receives to a given
|
67
65
|
# block.
|
@@ -524,5 +522,16 @@ module Spidr
|
|
524
522
|
@every_link_blocks << block
|
525
523
|
return self
|
526
524
|
end
|
525
|
+
|
526
|
+
protected
|
527
|
+
|
528
|
+
def initialize_events(options={})
|
529
|
+
@every_url_blocks = []
|
530
|
+
@every_failed_url_blocks = []
|
531
|
+
@every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
532
|
+
|
533
|
+
@every_page_blocks = []
|
534
|
+
@every_link_blocks = []
|
535
|
+
end
|
527
536
|
end
|
528
537
|
end
|
data/lib/spidr/filters.rb
CHANGED
@@ -6,110 +6,8 @@ module Spidr
|
|
6
6
|
# URLs the agent will visit.
|
7
7
|
#
|
8
8
|
module Filters
|
9
|
-
|
10
|
-
|
11
|
-
# List of acceptable URL schemes to follow
|
12
|
-
attr_reader :schemes
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
#
|
17
|
-
# Initializes filtering rules.
|
18
|
-
#
|
19
|
-
# @param [Hash] options
|
20
|
-
# Additional options.
|
21
|
-
#
|
22
|
-
# @option options [Array] :schemes (['http', 'https'])
|
23
|
-
# The list of acceptable URI schemes to visit.
|
24
|
-
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
25
|
-
#
|
26
|
-
# @option options [String] :host
|
27
|
-
# The host-name to visit.
|
28
|
-
#
|
29
|
-
# @option options [Array<String, Regexp, Proc>] :hosts
|
30
|
-
# The patterns which match the host-names to visit.
|
31
|
-
#
|
32
|
-
# @option options [Array<String, Regexp, Proc>] :ignore_hosts
|
33
|
-
# The patterns which match the host-names to not visit.
|
34
|
-
#
|
35
|
-
# @option options [Array<Integer, Regexp, Proc>] :ports
|
36
|
-
# The patterns which match the ports to visit.
|
37
|
-
#
|
38
|
-
# @option options [Array<Integer, Regexp, Proc>] :ignore_ports
|
39
|
-
# The patterns which match the ports to not visit.
|
40
|
-
#
|
41
|
-
# @option options [Array<String, Regexp, Proc>] :links
|
42
|
-
# The patterns which match the links to visit.
|
43
|
-
#
|
44
|
-
# @option options [Array<String, Regexp, Proc>] :ignore_links
|
45
|
-
# The patterns which match the links to not visit.
|
46
|
-
#
|
47
|
-
# @option options [Array<String, Regexp, Proc>] :urls
|
48
|
-
# The patterns which match the URLs to visit.
|
49
|
-
#
|
50
|
-
# @option options [Array<String, Regexp, Proc>] :ignore_urls
|
51
|
-
# The patterns which match the URLs to not visit.
|
52
|
-
#
|
53
|
-
# @option options [Array<String, Regexp, Proc>] :exts
|
54
|
-
# The patterns which match the URI path extensions to visit.
|
55
|
-
#
|
56
|
-
# @option options [Array<String, Regexp, Proc>] :ignore_exts
|
57
|
-
# The patterns which match the URI path extensions to not visit.
|
58
|
-
#
|
59
|
-
def initialize(options={})
|
60
|
-
super(options)
|
61
|
-
|
62
|
-
@schemes = []
|
63
|
-
|
64
|
-
if options[:schemes]
|
65
|
-
@schemes += options[:schemes]
|
66
|
-
else
|
67
|
-
@schemes << 'http'
|
68
|
-
|
69
|
-
begin
|
70
|
-
require 'net/https'
|
71
|
-
|
72
|
-
@schemes << 'https'
|
73
|
-
rescue Gem::LoadError => e
|
74
|
-
raise(e)
|
75
|
-
rescue ::LoadError
|
76
|
-
STDERR.puts "Warning: cannot load 'net/https', https support disabled"
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
@host_rules = Rules.new(
|
81
|
-
:accept => options[:hosts],
|
82
|
-
:reject => options[:ignore_hosts]
|
83
|
-
)
|
84
|
-
@port_rules = Rules.new(
|
85
|
-
:accept => options[:ports],
|
86
|
-
:reject => options[:ignore_ports]
|
87
|
-
)
|
88
|
-
@link_rules = Rules.new(
|
89
|
-
:accept => options[:links],
|
90
|
-
:reject => options[:ignore_links]
|
91
|
-
)
|
92
|
-
@url_rules = Rules.new(
|
93
|
-
:accept => options[:urls],
|
94
|
-
:reject => options[:ignore_urls]
|
95
|
-
)
|
96
|
-
@ext_rules = Rules.new(
|
97
|
-
:accept => options[:exts],
|
98
|
-
:reject => options[:ignore_exts]
|
99
|
-
)
|
100
|
-
|
101
|
-
if options[:host]
|
102
|
-
visit_hosts_like(options[:host])
|
103
|
-
end
|
104
|
-
|
105
|
-
if options[:queue]
|
106
|
-
self.queue = options[:queue]
|
107
|
-
end
|
108
|
-
|
109
|
-
if options[:history]
|
110
|
-
self.history = options[:history]
|
111
|
-
end
|
112
|
-
end
|
9
|
+
# List of acceptable URL schemes to follow
|
10
|
+
attr_reader :schemes
|
113
11
|
|
114
12
|
#
|
115
13
|
# Sets the list of acceptable URL schemes to visit.
|
@@ -458,6 +356,102 @@ module Spidr
|
|
458
356
|
|
459
357
|
protected
|
460
358
|
|
359
|
+
#
|
360
|
+
# Initializes filtering rules.
|
361
|
+
#
|
362
|
+
# @param [Hash] options
|
363
|
+
# Additional options.
|
364
|
+
#
|
365
|
+
# @option options [Array] :schemes (['http', 'https'])
|
366
|
+
# The list of acceptable URI schemes to visit.
|
367
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
368
|
+
#
|
369
|
+
# @option options [String] :host
|
370
|
+
# The host-name to visit.
|
371
|
+
#
|
372
|
+
# @option options [Array<String, Regexp, Proc>] :hosts
|
373
|
+
# The patterns which match the host-names to visit.
|
374
|
+
#
|
375
|
+
# @option options [Array<String, Regexp, Proc>] :ignore_hosts
|
376
|
+
# The patterns which match the host-names to not visit.
|
377
|
+
#
|
378
|
+
# @option options [Array<Integer, Regexp, Proc>] :ports
|
379
|
+
# The patterns which match the ports to visit.
|
380
|
+
#
|
381
|
+
# @option options [Array<Integer, Regexp, Proc>] :ignore_ports
|
382
|
+
# The patterns which match the ports to not visit.
|
383
|
+
#
|
384
|
+
# @option options [Array<String, Regexp, Proc>] :links
|
385
|
+
# The patterns which match the links to visit.
|
386
|
+
#
|
387
|
+
# @option options [Array<String, Regexp, Proc>] :ignore_links
|
388
|
+
# The patterns which match the links to not visit.
|
389
|
+
#
|
390
|
+
# @option options [Array<String, Regexp, Proc>] :urls
|
391
|
+
# The patterns which match the URLs to visit.
|
392
|
+
#
|
393
|
+
# @option options [Array<String, Regexp, Proc>] :ignore_urls
|
394
|
+
# The patterns which match the URLs to not visit.
|
395
|
+
#
|
396
|
+
# @option options [Array<String, Regexp, Proc>] :exts
|
397
|
+
# The patterns which match the URI path extensions to visit.
|
398
|
+
#
|
399
|
+
# @option options [Array<String, Regexp, Proc>] :ignore_exts
|
400
|
+
# The patterns which match the URI path extensions to not visit.
|
401
|
+
#
|
402
|
+
def initialize_filters(options={})
|
403
|
+
@schemes = []
|
404
|
+
|
405
|
+
if options[:schemes]
|
406
|
+
@schemes += options[:schemes]
|
407
|
+
else
|
408
|
+
@schemes << 'http'
|
409
|
+
|
410
|
+
begin
|
411
|
+
require 'net/https'
|
412
|
+
|
413
|
+
@schemes << 'https'
|
414
|
+
rescue Gem::LoadError => e
|
415
|
+
raise(e)
|
416
|
+
rescue ::LoadError
|
417
|
+
STDERR.puts "Warning: cannot load 'net/https', https support disabled"
|
418
|
+
end
|
419
|
+
end
|
420
|
+
|
421
|
+
@host_rules = Rules.new(
|
422
|
+
:accept => options[:hosts],
|
423
|
+
:reject => options[:ignore_hosts]
|
424
|
+
)
|
425
|
+
@port_rules = Rules.new(
|
426
|
+
:accept => options[:ports],
|
427
|
+
:reject => options[:ignore_ports]
|
428
|
+
)
|
429
|
+
@link_rules = Rules.new(
|
430
|
+
:accept => options[:links],
|
431
|
+
:reject => options[:ignore_links]
|
432
|
+
)
|
433
|
+
@url_rules = Rules.new(
|
434
|
+
:accept => options[:urls],
|
435
|
+
:reject => options[:ignore_urls]
|
436
|
+
)
|
437
|
+
@ext_rules = Rules.new(
|
438
|
+
:accept => options[:exts],
|
439
|
+
:reject => options[:ignore_exts]
|
440
|
+
)
|
441
|
+
|
442
|
+
if options[:host]
|
443
|
+
visit_hosts_like(options[:host])
|
444
|
+
end
|
445
|
+
|
446
|
+
if options[:queue]
|
447
|
+
self.queue = options[:queue]
|
448
|
+
end
|
449
|
+
|
450
|
+
if options[:history]
|
451
|
+
self.history = options[:history]
|
452
|
+
end
|
453
|
+
end
|
454
|
+
|
461
455
|
#
|
462
456
|
# Determines if a given URI scheme should be visited.
|
463
457
|
#
|
data/lib/spidr/headers.rb
CHANGED
@@ -295,9 +295,9 @@ module Spidr
|
|
295
295
|
cookie.split('; ').each do |key_value|
|
296
296
|
key, value = key_value.split('=',2)
|
297
297
|
|
298
|
-
|
299
|
-
|
300
|
-
|
298
|
+
unless RESERVED_COOKIE_NAMES.include?(key)
|
299
|
+
params[key] = (value || '')
|
300
|
+
end
|
301
301
|
end
|
302
302
|
end
|
303
303
|
|
data/lib/spidr/links.rb
CHANGED
data/lib/spidr/rules.rb
CHANGED
@@ -40,17 +40,9 @@ module Spidr
|
|
40
40
|
#
|
41
41
|
def accept?(data)
|
42
42
|
unless @accept.empty?
|
43
|
-
@accept.
|
44
|
-
return true if test_data(data,rule)
|
45
|
-
end
|
46
|
-
|
47
|
-
return false
|
43
|
+
@accept.any? { |rule| test_data(data,rule) }
|
48
44
|
else
|
49
|
-
|
50
|
-
return false if test_data(data,rule)
|
51
|
-
end
|
52
|
-
|
53
|
-
return true
|
45
|
+
!@reject.any? { |rule| test_data(data,rule) }
|
54
46
|
end
|
55
47
|
end
|
56
48
|
|
@@ -62,7 +54,7 @@ module Spidr
|
|
62
54
|
# rejection patterns.
|
63
55
|
#
|
64
56
|
def reject?(data)
|
65
|
-
!
|
57
|
+
!accept?(data)
|
66
58
|
end
|
67
59
|
|
68
60
|
protected
|
@@ -75,11 +67,11 @@ module Spidr
|
|
75
67
|
#
|
76
68
|
def test_data(data,rule)
|
77
69
|
if rule.kind_of?(Proc)
|
78
|
-
|
70
|
+
rule.call(data) == true
|
79
71
|
elsif rule.kind_of?(Regexp)
|
80
|
-
|
72
|
+
!((data.to_s =~ rule).nil?)
|
81
73
|
else
|
82
|
-
|
74
|
+
data == rule
|
83
75
|
end
|
84
76
|
end
|
85
77
|
|
data/lib/spidr/sanitizers.rb
CHANGED
@@ -6,39 +6,11 @@ module Spidr
|
|
6
6
|
# sanitation of incoming links.
|
7
7
|
#
|
8
8
|
module Sanitizers
|
9
|
-
|
10
|
-
|
11
|
-
# Specifies whether the Agent will strip URI fragments
|
12
|
-
attr_accessor :strip_fragments
|
9
|
+
# Specifies whether the Agent will strip URI fragments
|
10
|
+
attr_accessor :strip_fragments
|
13
11
|
|
14
|
-
|
15
|
-
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
#
|
20
|
-
# Initializes the Sanitizer rules.
|
21
|
-
#
|
22
|
-
# @param [Hash] options
|
23
|
-
# Additional options.
|
24
|
-
#
|
25
|
-
# @option options [Boolean] :strip_fragments (true)
|
26
|
-
# Specifies whether or not to strip the fragment component from URLs.
|
27
|
-
#
|
28
|
-
# @option options [Boolean] :strip_query (false)
|
29
|
-
# Specifies whether or not to strip the query component from URLs.
|
30
|
-
#
|
31
|
-
# @since 0.2.2
|
32
|
-
#
|
33
|
-
def initialize(options={})
|
34
|
-
@strip_fragments = true
|
35
|
-
|
36
|
-
if options.has_key?(:strip_fragments)
|
37
|
-
@strip_fragments = options[:strip_fragments]
|
38
|
-
end
|
39
|
-
|
40
|
-
@strip_query = (options[:strip_query] || false)
|
41
|
-
end
|
12
|
+
# Specifies whether the Agent will strip URI queries
|
13
|
+
attr_accessor :strip_query
|
42
14
|
|
43
15
|
#
|
44
16
|
# Sanitizes a URL based on filtering options.
|
@@ -59,5 +31,26 @@ module Spidr
|
|
59
31
|
|
60
32
|
return url
|
61
33
|
end
|
34
|
+
|
35
|
+
protected
|
36
|
+
|
37
|
+
#
|
38
|
+
# Initializes the Sanitizer rules.
|
39
|
+
#
|
40
|
+
# @param [Hash] options
|
41
|
+
# Additional options.
|
42
|
+
#
|
43
|
+
# @option options [Boolean] :strip_fragments (true)
|
44
|
+
# Specifies whether or not to strip the fragment component from URLs.
|
45
|
+
#
|
46
|
+
# @option options [Boolean] :strip_query (false)
|
47
|
+
# Specifies whether or not to strip the query component from URLs.
|
48
|
+
#
|
49
|
+
# @since 0.2.2
|
50
|
+
#
|
51
|
+
def initialize_sanitizers(options={})
|
52
|
+
@strip_fragments = options.fetch(:strip_fragments,true)
|
53
|
+
@strip_query = options.fetch(:strip_query,false)
|
54
|
+
end
|
62
55
|
end
|
63
56
|
end
|
data/lib/spidr/version.rb
CHANGED
data/spidr.gemspec
CHANGED
@@ -1,15 +1,127 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
Gem::Specification.new do |gemspec|
|
6
|
+
files = if File.directory?('.git')
|
7
|
+
`git ls-files`.split($/)
|
8
|
+
elsif File.directory?('.hg')
|
9
|
+
`hg manifest`.split($/)
|
10
|
+
elsif File.directory?('.svn')
|
11
|
+
`svn ls -R`.split($/).select { |path| File.file?(path) }
|
12
|
+
else
|
13
|
+
Dir['{**/}{.*,*}'].select { |path| File.file?(path) }
|
14
|
+
end
|
15
|
+
|
16
|
+
filter_files = lambda { |paths|
|
17
|
+
case paths
|
18
|
+
when Array
|
19
|
+
(files & paths)
|
20
|
+
when String
|
21
|
+
(files & Dir[paths])
|
22
|
+
end
|
23
|
+
}
|
24
|
+
|
25
|
+
version = {
|
26
|
+
:file => 'lib/spidr/version.rb',
|
27
|
+
:constant => 'Spidr::VERSION'
|
28
|
+
}
|
29
|
+
|
30
|
+
defaults = {
|
31
|
+
'name' => File.basename(File.dirname(__FILE__)),
|
32
|
+
'files' => files,
|
33
|
+
'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
|
34
|
+
'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
|
35
|
+
'extra_doc_files' => filter_files['*.{txt,rdoc,md,markdown,tt,textile}'],
|
36
|
+
}
|
37
|
+
|
38
|
+
metadata = defaults.merge(YAML.load_file('gemspec.yml'))
|
39
|
+
|
40
|
+
gemspec.name = metadata.fetch('name',defaults[:name])
|
41
|
+
gemspec.version = if metadata['version']
|
42
|
+
metadata['version']
|
43
|
+
elsif File.file?(version[:file])
|
44
|
+
require File.join('.',version[:file])
|
45
|
+
eval(version[:constant])
|
46
|
+
end
|
47
|
+
|
48
|
+
gemspec.summary = metadata.fetch('summary',metadata['description'])
|
49
|
+
gemspec.description = metadata.fetch('description',metadata['summary'])
|
50
|
+
|
51
|
+
case metadata['license']
|
52
|
+
when Array
|
53
|
+
gemspec.licenses = metadata['license']
|
54
|
+
when String
|
55
|
+
gemspec.license = metadata['license']
|
56
|
+
end
|
57
|
+
|
58
|
+
case metadata['authors']
|
59
|
+
when Array
|
60
|
+
gemspec.authors = metadata['authors']
|
61
|
+
when String
|
62
|
+
gemspec.author = metadata['authors']
|
63
|
+
end
|
64
|
+
|
65
|
+
gemspec.email = metadata['email']
|
66
|
+
gemspec.homepage = metadata['homepage']
|
67
|
+
|
68
|
+
case metadata['require_paths']
|
69
|
+
when Array
|
70
|
+
gemspec.require_paths = metadata['require_paths']
|
71
|
+
when String
|
72
|
+
gemspec.require_path = metadata['require_paths']
|
73
|
+
end
|
74
|
+
|
75
|
+
gemspec.files = filter_files[metadata['files']]
|
76
|
+
|
77
|
+
gemspec.executables = metadata['executables']
|
78
|
+
gemspec.extensions = metadata['extensions']
|
79
|
+
|
80
|
+
if Gem::VERSION < '1.7.'
|
81
|
+
gemspec.default_executable = gemspec.executables.first
|
82
|
+
end
|
83
|
+
|
84
|
+
gemspec.test_files = filter_files[metadata['test_files']]
|
85
|
+
|
86
|
+
unless gemspec.files.include?('.document')
|
87
|
+
gemspec.extra_rdoc_files = metadata['extra_doc_files']
|
88
|
+
end
|
89
|
+
|
90
|
+
gemspec.post_install_message = metadata['post_install_message']
|
91
|
+
gemspec.requirements = metadata['requirements']
|
92
|
+
|
93
|
+
if gemspec.respond_to?(:required_ruby_version=)
|
94
|
+
gemspec.required_ruby_version = metadata['required_ruby_version']
|
95
|
+
end
|
96
|
+
|
97
|
+
if gemspec.respond_to?(:required_rubygems_version=)
|
98
|
+
gemspec.required_rubygems_version = metadata['required_ruby_version']
|
99
|
+
end
|
100
|
+
|
101
|
+
parse_versions = lambda { |versions|
|
102
|
+
case versions
|
103
|
+
when Array
|
104
|
+
versions.map { |v| v.to_s }
|
105
|
+
when String
|
106
|
+
versions.split(/,\s*/)
|
107
|
+
end
|
108
|
+
}
|
109
|
+
|
110
|
+
if metadata['dependencies']
|
111
|
+
metadata['dependencies'].each do |name,versions|
|
112
|
+
gemspec.add_dependency(name,parse_versions[versions])
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
if metadata['runtime_dependencies']
|
117
|
+
metadata['runtime_dependencies'].each do |name,versions|
|
118
|
+
gemspec.add_runtime_dependency(name,parse_versions[versions])
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
if metadata['development_dependencies']
|
123
|
+
metadata['development_dependencies'].each do |name,versions|
|
124
|
+
gemspec.add_development_dependency(name,parse_versions[versions])
|
125
|
+
end
|
14
126
|
end
|
15
127
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.3.
|
5
|
+
version: 0.3.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Postmodern
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
13
|
+
date: 2011-06-20 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|
@@ -128,7 +128,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
128
128
|
requirements: []
|
129
129
|
|
130
130
|
rubyforge_project: spidr
|
131
|
-
rubygems_version: 1.
|
131
|
+
rubygems_version: 1.8.5
|
132
132
|
signing_key:
|
133
133
|
specification_version: 3
|
134
134
|
summary: A versatile Ruby web spidering library
|