spidr 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog.md +9 -0
- data/Gemfile +1 -1
- data/lib/spidr/actions/actions.rb +6 -6
- data/lib/spidr/agent.rb +33 -43
- data/lib/spidr/body.rb +8 -9
- data/lib/spidr/events.rb +22 -13
- data/lib/spidr/filters.rb +98 -104
- data/lib/spidr/headers.rb +3 -3
- data/lib/spidr/links.rb +1 -1
- data/lib/spidr/rules.rb +6 -14
- data/lib/spidr/sanitizers.rb +25 -32
- data/lib/spidr/version.rb +1 -1
- data/spidr.gemspec +125 -13
- metadata +3 -3
data/ChangeLog.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
### 0.3.2 / 2011-06-20
|
2
|
+
|
3
|
+
* Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
|
4
|
+
{Spidr::Filters} and {Spidr::Sanitizers}.
|
5
|
+
* Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
|
6
|
+
* Reduce usage of `self.included` and `module_eval`.
|
7
|
+
* Reduce usage of nested-blocks.
|
8
|
+
* Reduce usage of `return`.
|
9
|
+
|
1
10
|
### 0.3.1 / 2011-04-22
|
2
11
|
|
3
12
|
* Require `set` in `spidr/headers.rb`.
|
data/Gemfile
CHANGED
@@ -8,12 +8,6 @@ module Spidr
|
|
8
8
|
# spidering of links.
|
9
9
|
#
|
10
10
|
module Actions
|
11
|
-
def initialize(options={})
|
12
|
-
@paused = false
|
13
|
-
|
14
|
-
super(options)
|
15
|
-
end
|
16
|
-
|
17
11
|
#
|
18
12
|
# Continue spidering.
|
19
13
|
#
|
@@ -79,5 +73,11 @@ module Spidr
|
|
79
73
|
def skip_page!
|
80
74
|
raise(SkipPage)
|
81
75
|
end
|
76
|
+
|
77
|
+
protected
|
78
|
+
|
79
|
+
def initialize_actions(options={})
|
80
|
+
@paused = false
|
81
|
+
end
|
82
82
|
end
|
83
83
|
end
|
data/lib/spidr/agent.rb
CHANGED
@@ -115,15 +115,15 @@ module Spidr
|
|
115
115
|
@host_headers.merge!(options[:host_headers])
|
116
116
|
end
|
117
117
|
|
118
|
-
@user_agent = (
|
118
|
+
@user_agent = options.fetch(:user_agent,Spidr.user_agent)
|
119
119
|
@referer = options[:referer]
|
120
120
|
|
121
|
-
@sessions = SessionCache.new(options
|
121
|
+
@sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
|
122
122
|
@cookies = CookieJar.new
|
123
123
|
@authorized = AuthStore.new
|
124
124
|
|
125
125
|
@running = false
|
126
|
-
@delay = (
|
126
|
+
@delay = options.fetch(:delay,0)
|
127
127
|
@history = Set[]
|
128
128
|
@failures = Set[]
|
129
129
|
@queue = []
|
@@ -131,7 +131,10 @@ module Spidr
|
|
131
131
|
@levels = Hash.new(0)
|
132
132
|
@max_depth = options[:max_depth]
|
133
133
|
|
134
|
-
|
134
|
+
initialize_sanitizers(options)
|
135
|
+
initialize_filters(options)
|
136
|
+
initialize_actions(options)
|
137
|
+
initialize_events(options)
|
135
138
|
|
136
139
|
yield self if block_given?
|
137
140
|
end
|
@@ -152,19 +155,16 @@ module Spidr
|
|
152
155
|
# @yieldparam [Agent] agent
|
153
156
|
# The newly created agent.
|
154
157
|
#
|
155
|
-
def self.start_at(url,options={})
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
spider.start_at(url)
|
160
|
-
end
|
158
|
+
def self.start_at(url,options={},&block)
|
159
|
+
agent = new(options,&block)
|
160
|
+
agent.start_at(url)
|
161
161
|
end
|
162
162
|
|
163
163
|
#
|
164
|
-
# Creates a new agent and spiders the given
|
164
|
+
# Creates a new agent and spiders the web-site located at the given URL.
|
165
165
|
#
|
166
|
-
# @param [String]
|
167
|
-
# The
|
166
|
+
# @param [URI::HTTP, String] url
|
167
|
+
# The web-site to spider.
|
168
168
|
#
|
169
169
|
# @param [Hash] options
|
170
170
|
# Additional options. See {Agent#initialize}.
|
@@ -176,19 +176,18 @@ module Spidr
|
|
176
176
|
# @yieldparam [Agent] agent
|
177
177
|
# The newly created agent.
|
178
178
|
#
|
179
|
-
def self.
|
180
|
-
|
181
|
-
yield spider if block_given?
|
179
|
+
def self.site(url,options={},&block)
|
180
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
182
181
|
|
183
|
-
|
184
|
-
|
182
|
+
agent = new(options.merge(:host => url.host),&block)
|
183
|
+
agent.start_at(url)
|
185
184
|
end
|
186
185
|
|
187
186
|
#
|
188
|
-
# Creates a new agent and spiders the
|
187
|
+
# Creates a new agent and spiders the given host.
|
189
188
|
#
|
190
|
-
# @param [
|
191
|
-
# The
|
189
|
+
# @param [String] name
|
190
|
+
# The host-name to spider.
|
192
191
|
#
|
193
192
|
# @param [Hash] options
|
194
193
|
# Additional options. See {Agent#initialize}.
|
@@ -200,14 +199,8 @@ module Spidr
|
|
200
199
|
# @yieldparam [Agent] agent
|
201
200
|
# The newly created agent.
|
202
201
|
#
|
203
|
-
def self.
|
204
|
-
|
205
|
-
|
206
|
-
return self.new(options.merge(:host => url.host)) do |spider|
|
207
|
-
yield spider if block_given?
|
208
|
-
|
209
|
-
spider.start_at(url)
|
210
|
-
end
|
202
|
+
def self.host(name,options={},&block)
|
203
|
+
site(URI::HTTP.build(:host => name, :path => '/'),options,&block)
|
211
204
|
end
|
212
205
|
|
213
206
|
#
|
@@ -234,7 +227,6 @@ module Spidr
|
|
234
227
|
#
|
235
228
|
def start_at(url,&block)
|
236
229
|
enqueue(url)
|
237
|
-
|
238
230
|
return run(&block)
|
239
231
|
end
|
240
232
|
|
@@ -261,7 +253,6 @@ module Spidr
|
|
261
253
|
end
|
262
254
|
|
263
255
|
@running = false
|
264
|
-
|
265
256
|
@sessions.clear
|
266
257
|
return self
|
267
258
|
end
|
@@ -387,10 +378,10 @@ module Spidr
|
|
387
378
|
|
388
379
|
new_failures.each do |url|
|
389
380
|
@failures << unless url.kind_of?(URI)
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
381
|
+
URI(url.to_s)
|
382
|
+
else
|
383
|
+
url
|
384
|
+
end
|
394
385
|
end
|
395
386
|
|
396
387
|
return @failures
|
@@ -471,7 +462,7 @@ module Spidr
|
|
471
462
|
begin
|
472
463
|
@every_url_blocks.each { |url_block| url_block.call(url) }
|
473
464
|
|
474
|
-
@
|
465
|
+
@every_url_like_blocks.each do |pattern,url_blocks|
|
475
466
|
match = case pattern
|
476
467
|
when Regexp
|
477
468
|
link =~ pattern
|
@@ -653,12 +644,11 @@ module Spidr
|
|
653
644
|
def prepare_request(url,&block)
|
654
645
|
host = url.host
|
655
646
|
port = url.port
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
end
|
647
|
+
path = unless url.path.empty?
|
648
|
+
url.path
|
649
|
+
else
|
650
|
+
'/'
|
651
|
+
end
|
662
652
|
|
663
653
|
# append the URL query to the path
|
664
654
|
path += "?#{url.query}" if url.query
|
@@ -724,7 +714,7 @@ module Spidr
|
|
724
714
|
# Specifies whether the given URL should be visited.
|
725
715
|
#
|
726
716
|
def visit?(url)
|
727
|
-
!
|
717
|
+
!visited?(url) &&
|
728
718
|
visit_scheme?(url.scheme) &&
|
729
719
|
visit_host?(url.host) &&
|
730
720
|
visit_port?(url.port) &&
|
data/lib/spidr/body.rb
CHANGED
@@ -24,16 +24,15 @@ module Spidr
|
|
24
24
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
25
25
|
#
|
26
26
|
def doc
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
27
|
+
unless body.empty?
|
28
|
+
begin
|
29
|
+
if html?
|
30
|
+
@doc ||= Nokogiri::HTML(body)
|
31
|
+
elsif (rss? || atom? || xml? || xsl?)
|
32
|
+
@doc ||= Nokogiri::XML(body)
|
33
|
+
end
|
34
|
+
rescue
|
34
35
|
end
|
35
|
-
rescue
|
36
|
-
return nil
|
37
36
|
end
|
38
37
|
end
|
39
38
|
|
data/lib/spidr/events.rb
CHANGED
@@ -5,17 +5,6 @@ module Spidr
|
|
5
5
|
# they are visited.
|
6
6
|
#
|
7
7
|
module Events
|
8
|
-
def initialize(options={})
|
9
|
-
super(options)
|
10
|
-
|
11
|
-
@every_url_blocks = []
|
12
|
-
@every_failed_url_blocks = []
|
13
|
-
@urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
14
|
-
|
15
|
-
@every_page_blocks = []
|
16
|
-
@every_link_blocks = []
|
17
|
-
end
|
18
|
-
|
19
8
|
#
|
20
9
|
# Pass each URL from each page visited to the given block.
|
21
10
|
#
|
@@ -57,11 +46,20 @@ module Spidr
|
|
57
46
|
# @yieldparam [URI::HTTP] url
|
58
47
|
# A matching URL.
|
59
48
|
#
|
60
|
-
|
61
|
-
|
49
|
+
# @since 0.3.2
|
50
|
+
#
|
51
|
+
def every_url_like(pattern,&block)
|
52
|
+
@every_url_like_blocks[pattern] << block
|
62
53
|
return self
|
63
54
|
end
|
64
55
|
|
56
|
+
#
|
57
|
+
# @see #every_url_like
|
58
|
+
#
|
59
|
+
def urls_like(pattern,&block)
|
60
|
+
every_url_like(pattern,&block)
|
61
|
+
end
|
62
|
+
|
65
63
|
#
|
66
64
|
# Pass the headers from every response the agent receives to a given
|
67
65
|
# block.
|
@@ -524,5 +522,16 @@ module Spidr
|
|
524
522
|
@every_link_blocks << block
|
525
523
|
return self
|
526
524
|
end
|
525
|
+
|
526
|
+
protected
|
527
|
+
|
528
|
+
def initialize_events(options={})
|
529
|
+
@every_url_blocks = []
|
530
|
+
@every_failed_url_blocks = []
|
531
|
+
@every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
532
|
+
|
533
|
+
@every_page_blocks = []
|
534
|
+
@every_link_blocks = []
|
535
|
+
end
|
527
536
|
end
|
528
537
|
end
|
data/lib/spidr/filters.rb
CHANGED
@@ -6,110 +6,8 @@ module Spidr
|
|
6
6
|
# URLs the agent will visit.
|
7
7
|
#
|
8
8
|
module Filters
|
9
|
-
|
10
|
-
|
11
|
-
# List of acceptable URL schemes to follow
|
12
|
-
attr_reader :schemes
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
#
|
17
|
-
# Initializes filtering rules.
|
18
|
-
#
|
19
|
-
# @param [Hash] options
|
20
|
-
# Additional options.
|
21
|
-
#
|
22
|
-
# @option options [Array] :schemes (['http', 'https'])
|
23
|
-
# The list of acceptable URI schemes to visit.
|
24
|
-
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
25
|
-
#
|
26
|
-
# @option options [String] :host
|
27
|
-
# The host-name to visit.
|
28
|
-
#
|
29
|
-
# @option options [Array<String, Regexp, Proc>] :hosts
|
30
|
-
# The patterns which match the host-names to visit.
|
31
|
-
#
|
32
|
-
# @option options [Array<String, Regexp, Proc>] :ignore_hosts
|
33
|
-
# The patterns which match the host-names to not visit.
|
34
|
-
#
|
35
|
-
# @option options [Array<Integer, Regexp, Proc>] :ports
|
36
|
-
# The patterns which match the ports to visit.
|
37
|
-
#
|
38
|
-
# @option options [Array<Integer, Regexp, Proc>] :ignore_ports
|
39
|
-
# The patterns which match the ports to not visit.
|
40
|
-
#
|
41
|
-
# @option options [Array<String, Regexp, Proc>] :links
|
42
|
-
# The patterns which match the links to visit.
|
43
|
-
#
|
44
|
-
# @option options [Array<String, Regexp, Proc>] :ignore_links
|
45
|
-
# The patterns which match the links to not visit.
|
46
|
-
#
|
47
|
-
# @option options [Array<String, Regexp, Proc>] :urls
|
48
|
-
# The patterns which match the URLs to visit.
|
49
|
-
#
|
50
|
-
# @option options [Array<String, Regexp, Proc>] :ignore_urls
|
51
|
-
# The patterns which match the URLs to not visit.
|
52
|
-
#
|
53
|
-
# @option options [Array<String, Regexp, Proc>] :exts
|
54
|
-
# The patterns which match the URI path extensions to visit.
|
55
|
-
#
|
56
|
-
# @option options [Array<String, Regexp, Proc>] :ignore_exts
|
57
|
-
# The patterns which match the URI path extensions to not visit.
|
58
|
-
#
|
59
|
-
def initialize(options={})
|
60
|
-
super(options)
|
61
|
-
|
62
|
-
@schemes = []
|
63
|
-
|
64
|
-
if options[:schemes]
|
65
|
-
@schemes += options[:schemes]
|
66
|
-
else
|
67
|
-
@schemes << 'http'
|
68
|
-
|
69
|
-
begin
|
70
|
-
require 'net/https'
|
71
|
-
|
72
|
-
@schemes << 'https'
|
73
|
-
rescue Gem::LoadError => e
|
74
|
-
raise(e)
|
75
|
-
rescue ::LoadError
|
76
|
-
STDERR.puts "Warning: cannot load 'net/https', https support disabled"
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
@host_rules = Rules.new(
|
81
|
-
:accept => options[:hosts],
|
82
|
-
:reject => options[:ignore_hosts]
|
83
|
-
)
|
84
|
-
@port_rules = Rules.new(
|
85
|
-
:accept => options[:ports],
|
86
|
-
:reject => options[:ignore_ports]
|
87
|
-
)
|
88
|
-
@link_rules = Rules.new(
|
89
|
-
:accept => options[:links],
|
90
|
-
:reject => options[:ignore_links]
|
91
|
-
)
|
92
|
-
@url_rules = Rules.new(
|
93
|
-
:accept => options[:urls],
|
94
|
-
:reject => options[:ignore_urls]
|
95
|
-
)
|
96
|
-
@ext_rules = Rules.new(
|
97
|
-
:accept => options[:exts],
|
98
|
-
:reject => options[:ignore_exts]
|
99
|
-
)
|
100
|
-
|
101
|
-
if options[:host]
|
102
|
-
visit_hosts_like(options[:host])
|
103
|
-
end
|
104
|
-
|
105
|
-
if options[:queue]
|
106
|
-
self.queue = options[:queue]
|
107
|
-
end
|
108
|
-
|
109
|
-
if options[:history]
|
110
|
-
self.history = options[:history]
|
111
|
-
end
|
112
|
-
end
|
9
|
+
# List of acceptable URL schemes to follow
|
10
|
+
attr_reader :schemes
|
113
11
|
|
114
12
|
#
|
115
13
|
# Sets the list of acceptable URL schemes to visit.
|
@@ -458,6 +356,102 @@ module Spidr
|
|
458
356
|
|
459
357
|
protected
|
460
358
|
|
359
|
+
#
|
360
|
+
# Initializes filtering rules.
|
361
|
+
#
|
362
|
+
# @param [Hash] options
|
363
|
+
# Additional options.
|
364
|
+
#
|
365
|
+
# @option options [Array] :schemes (['http', 'https'])
|
366
|
+
# The list of acceptable URI schemes to visit.
|
367
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
368
|
+
#
|
369
|
+
# @option options [String] :host
|
370
|
+
# The host-name to visit.
|
371
|
+
#
|
372
|
+
# @option options [Array<String, Regexp, Proc>] :hosts
|
373
|
+
# The patterns which match the host-names to visit.
|
374
|
+
#
|
375
|
+
# @option options [Array<String, Regexp, Proc>] :ignore_hosts
|
376
|
+
# The patterns which match the host-names to not visit.
|
377
|
+
#
|
378
|
+
# @option options [Array<Integer, Regexp, Proc>] :ports
|
379
|
+
# The patterns which match the ports to visit.
|
380
|
+
#
|
381
|
+
# @option options [Array<Integer, Regexp, Proc>] :ignore_ports
|
382
|
+
# The patterns which match the ports to not visit.
|
383
|
+
#
|
384
|
+
# @option options [Array<String, Regexp, Proc>] :links
|
385
|
+
# The patterns which match the links to visit.
|
386
|
+
#
|
387
|
+
# @option options [Array<String, Regexp, Proc>] :ignore_links
|
388
|
+
# The patterns which match the links to not visit.
|
389
|
+
#
|
390
|
+
# @option options [Array<String, Regexp, Proc>] :urls
|
391
|
+
# The patterns which match the URLs to visit.
|
392
|
+
#
|
393
|
+
# @option options [Array<String, Regexp, Proc>] :ignore_urls
|
394
|
+
# The patterns which match the URLs to not visit.
|
395
|
+
#
|
396
|
+
# @option options [Array<String, Regexp, Proc>] :exts
|
397
|
+
# The patterns which match the URI path extensions to visit.
|
398
|
+
#
|
399
|
+
# @option options [Array<String, Regexp, Proc>] :ignore_exts
|
400
|
+
# The patterns which match the URI path extensions to not visit.
|
401
|
+
#
|
402
|
+
def initialize_filters(options={})
|
403
|
+
@schemes = []
|
404
|
+
|
405
|
+
if options[:schemes]
|
406
|
+
@schemes += options[:schemes]
|
407
|
+
else
|
408
|
+
@schemes << 'http'
|
409
|
+
|
410
|
+
begin
|
411
|
+
require 'net/https'
|
412
|
+
|
413
|
+
@schemes << 'https'
|
414
|
+
rescue Gem::LoadError => e
|
415
|
+
raise(e)
|
416
|
+
rescue ::LoadError
|
417
|
+
STDERR.puts "Warning: cannot load 'net/https', https support disabled"
|
418
|
+
end
|
419
|
+
end
|
420
|
+
|
421
|
+
@host_rules = Rules.new(
|
422
|
+
:accept => options[:hosts],
|
423
|
+
:reject => options[:ignore_hosts]
|
424
|
+
)
|
425
|
+
@port_rules = Rules.new(
|
426
|
+
:accept => options[:ports],
|
427
|
+
:reject => options[:ignore_ports]
|
428
|
+
)
|
429
|
+
@link_rules = Rules.new(
|
430
|
+
:accept => options[:links],
|
431
|
+
:reject => options[:ignore_links]
|
432
|
+
)
|
433
|
+
@url_rules = Rules.new(
|
434
|
+
:accept => options[:urls],
|
435
|
+
:reject => options[:ignore_urls]
|
436
|
+
)
|
437
|
+
@ext_rules = Rules.new(
|
438
|
+
:accept => options[:exts],
|
439
|
+
:reject => options[:ignore_exts]
|
440
|
+
)
|
441
|
+
|
442
|
+
if options[:host]
|
443
|
+
visit_hosts_like(options[:host])
|
444
|
+
end
|
445
|
+
|
446
|
+
if options[:queue]
|
447
|
+
self.queue = options[:queue]
|
448
|
+
end
|
449
|
+
|
450
|
+
if options[:history]
|
451
|
+
self.history = options[:history]
|
452
|
+
end
|
453
|
+
end
|
454
|
+
|
461
455
|
#
|
462
456
|
# Determines if a given URI scheme should be visited.
|
463
457
|
#
|
data/lib/spidr/headers.rb
CHANGED
@@ -295,9 +295,9 @@ module Spidr
|
|
295
295
|
cookie.split('; ').each do |key_value|
|
296
296
|
key, value = key_value.split('=',2)
|
297
297
|
|
298
|
-
|
299
|
-
|
300
|
-
|
298
|
+
unless RESERVED_COOKIE_NAMES.include?(key)
|
299
|
+
params[key] = (value || '')
|
300
|
+
end
|
301
301
|
end
|
302
302
|
end
|
303
303
|
|
data/lib/spidr/links.rb
CHANGED
data/lib/spidr/rules.rb
CHANGED
@@ -40,17 +40,9 @@ module Spidr
|
|
40
40
|
#
|
41
41
|
def accept?(data)
|
42
42
|
unless @accept.empty?
|
43
|
-
@accept.
|
44
|
-
return true if test_data(data,rule)
|
45
|
-
end
|
46
|
-
|
47
|
-
return false
|
43
|
+
@accept.any? { |rule| test_data(data,rule) }
|
48
44
|
else
|
49
|
-
|
50
|
-
return false if test_data(data,rule)
|
51
|
-
end
|
52
|
-
|
53
|
-
return true
|
45
|
+
!@reject.any? { |rule| test_data(data,rule) }
|
54
46
|
end
|
55
47
|
end
|
56
48
|
|
@@ -62,7 +54,7 @@ module Spidr
|
|
62
54
|
# rejection patterns.
|
63
55
|
#
|
64
56
|
def reject?(data)
|
65
|
-
!
|
57
|
+
!accept?(data)
|
66
58
|
end
|
67
59
|
|
68
60
|
protected
|
@@ -75,11 +67,11 @@ module Spidr
|
|
75
67
|
#
|
76
68
|
def test_data(data,rule)
|
77
69
|
if rule.kind_of?(Proc)
|
78
|
-
|
70
|
+
rule.call(data) == true
|
79
71
|
elsif rule.kind_of?(Regexp)
|
80
|
-
|
72
|
+
!((data.to_s =~ rule).nil?)
|
81
73
|
else
|
82
|
-
|
74
|
+
data == rule
|
83
75
|
end
|
84
76
|
end
|
85
77
|
|
data/lib/spidr/sanitizers.rb
CHANGED
@@ -6,39 +6,11 @@ module Spidr
|
|
6
6
|
# sanitation of incoming links.
|
7
7
|
#
|
8
8
|
module Sanitizers
|
9
|
-
|
10
|
-
|
11
|
-
# Specifies whether the Agent will strip URI fragments
|
12
|
-
attr_accessor :strip_fragments
|
9
|
+
# Specifies whether the Agent will strip URI fragments
|
10
|
+
attr_accessor :strip_fragments
|
13
11
|
|
14
|
-
|
15
|
-
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
#
|
20
|
-
# Initializes the Sanitizer rules.
|
21
|
-
#
|
22
|
-
# @param [Hash] options
|
23
|
-
# Additional options.
|
24
|
-
#
|
25
|
-
# @option options [Boolean] :strip_fragments (true)
|
26
|
-
# Specifies whether or not to strip the fragment component from URLs.
|
27
|
-
#
|
28
|
-
# @option options [Boolean] :strip_query (false)
|
29
|
-
# Specifies whether or not to strip the query component from URLs.
|
30
|
-
#
|
31
|
-
# @since 0.2.2
|
32
|
-
#
|
33
|
-
def initialize(options={})
|
34
|
-
@strip_fragments = true
|
35
|
-
|
36
|
-
if options.has_key?(:strip_fragments)
|
37
|
-
@strip_fragments = options[:strip_fragments]
|
38
|
-
end
|
39
|
-
|
40
|
-
@strip_query = (options[:strip_query] || false)
|
41
|
-
end
|
12
|
+
# Specifies whether the Agent will strip URI queries
|
13
|
+
attr_accessor :strip_query
|
42
14
|
|
43
15
|
#
|
44
16
|
# Sanitizes a URL based on filtering options.
|
@@ -59,5 +31,26 @@ module Spidr
|
|
59
31
|
|
60
32
|
return url
|
61
33
|
end
|
34
|
+
|
35
|
+
protected
|
36
|
+
|
37
|
+
#
|
38
|
+
# Initializes the Sanitizer rules.
|
39
|
+
#
|
40
|
+
# @param [Hash] options
|
41
|
+
# Additional options.
|
42
|
+
#
|
43
|
+
# @option options [Boolean] :strip_fragments (true)
|
44
|
+
# Specifies whether or not to strip the fragment component from URLs.
|
45
|
+
#
|
46
|
+
# @option options [Boolean] :strip_query (false)
|
47
|
+
# Specifies whether or not to strip the query component from URLs.
|
48
|
+
#
|
49
|
+
# @since 0.2.2
|
50
|
+
#
|
51
|
+
def initialize_sanitizers(options={})
|
52
|
+
@strip_fragments = options.fetch(:strip_fragments,true)
|
53
|
+
@strip_query = options.fetch(:strip_query,false)
|
54
|
+
end
|
62
55
|
end
|
63
56
|
end
|
data/lib/spidr/version.rb
CHANGED
data/spidr.gemspec
CHANGED
@@ -1,15 +1,127 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
Gem::Specification.new do |gemspec|
|
6
|
+
files = if File.directory?('.git')
|
7
|
+
`git ls-files`.split($/)
|
8
|
+
elsif File.directory?('.hg')
|
9
|
+
`hg manifest`.split($/)
|
10
|
+
elsif File.directory?('.svn')
|
11
|
+
`svn ls -R`.split($/).select { |path| File.file?(path) }
|
12
|
+
else
|
13
|
+
Dir['{**/}{.*,*}'].select { |path| File.file?(path) }
|
14
|
+
end
|
15
|
+
|
16
|
+
filter_files = lambda { |paths|
|
17
|
+
case paths
|
18
|
+
when Array
|
19
|
+
(files & paths)
|
20
|
+
when String
|
21
|
+
(files & Dir[paths])
|
22
|
+
end
|
23
|
+
}
|
24
|
+
|
25
|
+
version = {
|
26
|
+
:file => 'lib/spidr/version.rb',
|
27
|
+
:constant => 'Spidr::VERSION'
|
28
|
+
}
|
29
|
+
|
30
|
+
defaults = {
|
31
|
+
'name' => File.basename(File.dirname(__FILE__)),
|
32
|
+
'files' => files,
|
33
|
+
'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
|
34
|
+
'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
|
35
|
+
'extra_doc_files' => filter_files['*.{txt,rdoc,md,markdown,tt,textile}'],
|
36
|
+
}
|
37
|
+
|
38
|
+
metadata = defaults.merge(YAML.load_file('gemspec.yml'))
|
39
|
+
|
40
|
+
gemspec.name = metadata.fetch('name',defaults[:name])
|
41
|
+
gemspec.version = if metadata['version']
|
42
|
+
metadata['version']
|
43
|
+
elsif File.file?(version[:file])
|
44
|
+
require File.join('.',version[:file])
|
45
|
+
eval(version[:constant])
|
46
|
+
end
|
47
|
+
|
48
|
+
gemspec.summary = metadata.fetch('summary',metadata['description'])
|
49
|
+
gemspec.description = metadata.fetch('description',metadata['summary'])
|
50
|
+
|
51
|
+
case metadata['license']
|
52
|
+
when Array
|
53
|
+
gemspec.licenses = metadata['license']
|
54
|
+
when String
|
55
|
+
gemspec.license = metadata['license']
|
56
|
+
end
|
57
|
+
|
58
|
+
case metadata['authors']
|
59
|
+
when Array
|
60
|
+
gemspec.authors = metadata['authors']
|
61
|
+
when String
|
62
|
+
gemspec.author = metadata['authors']
|
63
|
+
end
|
64
|
+
|
65
|
+
gemspec.email = metadata['email']
|
66
|
+
gemspec.homepage = metadata['homepage']
|
67
|
+
|
68
|
+
case metadata['require_paths']
|
69
|
+
when Array
|
70
|
+
gemspec.require_paths = metadata['require_paths']
|
71
|
+
when String
|
72
|
+
gemspec.require_path = metadata['require_paths']
|
73
|
+
end
|
74
|
+
|
75
|
+
gemspec.files = filter_files[metadata['files']]
|
76
|
+
|
77
|
+
gemspec.executables = metadata['executables']
|
78
|
+
gemspec.extensions = metadata['extensions']
|
79
|
+
|
80
|
+
if Gem::VERSION < '1.7.'
|
81
|
+
gemspec.default_executable = gemspec.executables.first
|
82
|
+
end
|
83
|
+
|
84
|
+
gemspec.test_files = filter_files[metadata['test_files']]
|
85
|
+
|
86
|
+
unless gemspec.files.include?('.document')
|
87
|
+
gemspec.extra_rdoc_files = metadata['extra_doc_files']
|
88
|
+
end
|
89
|
+
|
90
|
+
gemspec.post_install_message = metadata['post_install_message']
|
91
|
+
gemspec.requirements = metadata['requirements']
|
92
|
+
|
93
|
+
if gemspec.respond_to?(:required_ruby_version=)
|
94
|
+
gemspec.required_ruby_version = metadata['required_ruby_version']
|
95
|
+
end
|
96
|
+
|
97
|
+
if gemspec.respond_to?(:required_rubygems_version=)
|
98
|
+
gemspec.required_rubygems_version = metadata['required_ruby_version']
|
99
|
+
end
|
100
|
+
|
101
|
+
parse_versions = lambda { |versions|
|
102
|
+
case versions
|
103
|
+
when Array
|
104
|
+
versions.map { |v| v.to_s }
|
105
|
+
when String
|
106
|
+
versions.split(/,\s*/)
|
107
|
+
end
|
108
|
+
}
|
109
|
+
|
110
|
+
if metadata['dependencies']
|
111
|
+
metadata['dependencies'].each do |name,versions|
|
112
|
+
gemspec.add_dependency(name,parse_versions[versions])
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
if metadata['runtime_dependencies']
|
117
|
+
metadata['runtime_dependencies'].each do |name,versions|
|
118
|
+
gemspec.add_runtime_dependency(name,parse_versions[versions])
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
if metadata['development_dependencies']
|
123
|
+
metadata['development_dependencies'].each do |name,versions|
|
124
|
+
gemspec.add_development_dependency(name,parse_versions[versions])
|
125
|
+
end
|
14
126
|
end
|
15
127
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.3.
|
5
|
+
version: 0.3.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Postmodern
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
13
|
+
date: 2011-06-20 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|
@@ -128,7 +128,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
128
128
|
requirements: []
|
129
129
|
|
130
130
|
rubyforge_project: spidr
|
131
|
-
rubygems_version: 1.
|
131
|
+
rubygems_version: 1.8.5
|
132
132
|
signing_key:
|
133
133
|
specification_version: 3
|
134
134
|
summary: A versatile Ruby web spidering library
|