spidr 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +17 -0
- data/Gemfile +8 -5
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +8 -1
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +55 -56
- data/lib/spidr/agent/sanitizers.rb +6 -9
- data/lib/spidr/agent.rb +230 -120
- data/lib/spidr/auth_store.rb +10 -6
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +17 -19
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +23 -21
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +16 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +357 -10
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +19 -19
- data/.travis.yml +0 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 46a2f2ad2ca789b83fac0e2519294403734e2ad6d647fbc3a612d429e57c1b43
|
4
|
+
data.tar.gz: b72f561e337c6a0fcdbca9f59562e06f0b5854b15d321f90be1a4168b352faca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ced221d8cdbeaf95df12d6c038de6539a5148657209137433cc82c5abc69779a13376a7e6becdf423d2f2bdd9ebfaf8c7b94a51dda70ffcbab932da4fc5260b3
|
7
|
+
data.tar.gz: f54bedf3648dd033b8a37388413ae4ab71b4b09f16cc508b8e43e72f2ef870c59fe325e3f36a841791d9d843acb08bb02009469168e9b231a9835a0249b55b6c
|
data/.editorconfig
ADDED
data/.github/workflows/ruby.yml
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on: [ push, pull_request ]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
tests:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
strategy:
|
9
|
+
fail-fast: false
|
10
|
+
matrix:
|
11
|
+
ruby:
|
12
|
+
- 2.7
|
13
|
+
- '3.0'
|
14
|
+
- '3.1'
|
15
|
+
- jruby
|
16
|
+
name: Ruby ${{ matrix.ruby }}
|
17
|
+
steps:
|
18
|
+
- uses: actions/checkout@v2
|
19
|
+
- name: Set up Ruby
|
20
|
+
uses: ruby/setup-ruby@v1
|
21
|
+
with:
|
22
|
+
ruby-version: ${{ matrix.ruby }}
|
23
|
+
- name: Install dependencies
|
24
|
+
run: bundle install --jobs 4 --retry 3
|
25
|
+
- name: Run tests
|
26
|
+
run: bundle exec rake test
|
data/.gitignore
CHANGED
data/ChangeLog.md
CHANGED
@@ -1,3 +1,20 @@
|
|
1
|
+
### 0.7.0 / 2022-12-31
|
2
|
+
|
3
|
+
* Added {Spidr.domain} and {Spidr::Agent.domain}.
|
4
|
+
* Added {Spidr::Page#gif?}.
|
5
|
+
* Added {Spidr::Page#jpeg?}.
|
6
|
+
* Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
|
7
|
+
* Added {Spidr::Page#png?}.
|
8
|
+
* {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
|
9
|
+
`URI::HTTP` object.
|
10
|
+
|
11
|
+
### 0.6.1 / 2019-10-24
|
12
|
+
|
13
|
+
* Check for the opaque component of URIs before attempting to set the path
|
14
|
+
component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
|
15
|
+
opaque` exceptions.
|
16
|
+
* Fix `@robots` instance variable warning (@spk).
|
17
|
+
|
1
18
|
### 0.6.0 / 2016-08-04
|
2
19
|
|
3
20
|
* Added {Spidr::Proxy}.
|
data/Gemfile
CHANGED
@@ -12,10 +12,13 @@ group :development do
|
|
12
12
|
gem 'rake'
|
13
13
|
gem 'rubygems-tasks', '~> 0.2'
|
14
14
|
|
15
|
-
gem 'rspec',
|
16
|
-
gem 'webmock',
|
17
|
-
gem 'sinatra',
|
15
|
+
gem 'rspec', '~> 3.0'
|
16
|
+
gem 'webmock', '~> 3.0'
|
17
|
+
gem 'sinatra', '~> 2.0'
|
18
|
+
gem 'simplecov', '~> 0.20'
|
18
19
|
|
19
|
-
gem 'kramdown'
|
20
|
-
gem '
|
20
|
+
gem 'kramdown'
|
21
|
+
gem 'redcarpet', platform: :mri
|
22
|
+
gem 'yard', '~> 0.9'
|
23
|
+
gem 'yard-spellcheck', require: false
|
21
24
|
end
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# Spidr
|
2
2
|
|
3
|
+
[![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
|
4
|
+
|
3
5
|
* [Homepage](https://github.com/postmodern/spidr#readme)
|
4
6
|
* [Source](https://github.com/postmodern/spidr)
|
5
7
|
* [Issues](https://github.com/postmodern/spidr/issues)
|
6
8
|
* [Mailing List](http://groups.google.com/group/spidr)
|
7
|
-
* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
|
8
|
-
* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
|
9
9
|
|
10
10
|
## Description
|
11
11
|
|
@@ -49,137 +49,194 @@ and easy to use.
|
|
49
49
|
|
50
50
|
Start spidering from a URL:
|
51
51
|
|
52
|
-
|
52
|
+
```ruby
|
53
|
+
Spidr.start_at('http://tenderlovemaking.com/') do |agent|
|
54
|
+
# ...
|
55
|
+
end
|
56
|
+
```
|
53
57
|
|
54
58
|
Spider a host:
|
55
59
|
|
56
|
-
|
60
|
+
```ruby
|
61
|
+
Spidr.host('solnic.eu') do |agent|
|
62
|
+
# ...
|
63
|
+
end
|
64
|
+
```
|
65
|
+
|
66
|
+
Spider a domain (and any sub-domains):
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
Spidr.domain('ruby-lang.org') do |agent|
|
70
|
+
# ...
|
71
|
+
end
|
72
|
+
```
|
57
73
|
|
58
74
|
Spider a site:
|
59
75
|
|
60
|
-
|
76
|
+
```ruby
|
77
|
+
Spidr.site('http://www.rubyflow.com/') do |agent|
|
78
|
+
# ...
|
79
|
+
end
|
80
|
+
```
|
61
81
|
|
62
82
|
Spider multiple hosts:
|
63
83
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
]
|
70
|
-
)
|
84
|
+
```ruby
|
85
|
+
Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
86
|
+
# ...
|
87
|
+
end
|
88
|
+
```
|
71
89
|
|
72
90
|
Do not spider certain links:
|
73
91
|
|
74
|
-
|
92
|
+
```ruby
|
93
|
+
Spidr.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
|
94
|
+
# ...
|
95
|
+
end
|
96
|
+
```
|
75
97
|
|
76
98
|
Do not spider links on certain ports:
|
77
99
|
|
78
|
-
|
100
|
+
```ruby
|
101
|
+
Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
102
|
+
# ...
|
103
|
+
end
|
104
|
+
```
|
79
105
|
|
80
106
|
Do not spider links blacklisted in robots.txt:
|
81
107
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
108
|
+
```ruby
|
109
|
+
Spidr.site('http://company.com/', robots: true) do |agent|
|
110
|
+
# ...
|
111
|
+
end
|
112
|
+
```
|
86
113
|
|
87
114
|
Print out visited URLs:
|
88
115
|
|
89
|
-
|
90
|
-
|
91
|
-
|
116
|
+
```ruby
|
117
|
+
Spidr.site('http://www.rubyinside.com/') do |spider|
|
118
|
+
spider.every_url { |url| puts url }
|
119
|
+
end
|
120
|
+
```
|
92
121
|
|
93
122
|
Build a URL map of a site:
|
94
123
|
|
95
|
-
|
124
|
+
```ruby
|
125
|
+
url_map = Hash.new { |hash,key| hash[key] = [] }
|
96
126
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
127
|
+
Spidr.site('http://intranet.com/') do |spider|
|
128
|
+
spider.every_link do |origin,dest|
|
129
|
+
url_map[dest] << origin
|
130
|
+
end
|
131
|
+
end
|
132
|
+
```
|
102
133
|
|
103
134
|
Print out the URLs that could not be requested:
|
104
135
|
|
105
|
-
|
106
|
-
|
107
|
-
|
136
|
+
```ruby
|
137
|
+
Spidr.site('http://company.com/') do |spider|
|
138
|
+
spider.every_failed_url { |url| puts url }
|
139
|
+
end
|
140
|
+
```
|
108
141
|
|
109
142
|
Finds all pages which have broken links:
|
110
143
|
|
111
|
-
|
144
|
+
```ruby
|
145
|
+
url_map = Hash.new { |hash,key| hash[key] = [] }
|
112
146
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
147
|
+
spider = Spidr.site('http://intranet.com/') do |spider|
|
148
|
+
spider.every_link do |origin,dest|
|
149
|
+
url_map[dest] << origin
|
150
|
+
end
|
151
|
+
end
|
118
152
|
|
119
|
-
|
120
|
-
|
153
|
+
spider.failures.each do |url|
|
154
|
+
puts "Broken link #{url} found in:"
|
121
155
|
|
122
|
-
|
123
|
-
|
156
|
+
url_map[url].each { |page| puts " #{page}" }
|
157
|
+
end
|
158
|
+
```
|
124
159
|
|
125
160
|
Search HTML and XML pages:
|
126
161
|
|
127
|
-
|
128
|
-
|
129
|
-
|
162
|
+
```ruby
|
163
|
+
Spidr.site('http://company.com/') do |spider|
|
164
|
+
spider.every_page do |page|
|
165
|
+
puts ">>> #{page.url}"
|
130
166
|
|
131
|
-
|
132
|
-
|
133
|
-
|
167
|
+
page.search('//meta').each do |meta|
|
168
|
+
name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
169
|
+
value = meta.attributes['content']
|
134
170
|
|
135
|
-
|
136
|
-
end
|
137
|
-
end
|
171
|
+
puts " #{name} = #{value}"
|
138
172
|
end
|
173
|
+
end
|
174
|
+
end
|
175
|
+
```
|
139
176
|
|
140
177
|
Print out the titles from every page:
|
141
178
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
179
|
+
```ruby
|
180
|
+
Spidr.site('https://www.ruby-lang.org/') do |spider|
|
181
|
+
spider.every_html_page do |page|
|
182
|
+
puts page.title
|
183
|
+
end
|
184
|
+
end
|
185
|
+
```
|
186
|
+
|
187
|
+
Print out every HTTP redirect:
|
188
|
+
|
189
|
+
```ruby
|
190
|
+
Spidr.host('company.com') do |spider|
|
191
|
+
spider.every_redirect_page do |page|
|
192
|
+
puts "#{page.url} -> #{page.headers['Location']}"
|
193
|
+
end
|
194
|
+
end
|
195
|
+
```
|
147
196
|
|
148
197
|
Find what kinds of web servers a host is using, by accessing the headers:
|
149
198
|
|
150
|
-
|
199
|
+
```ruby
|
200
|
+
servers = Set[]
|
151
201
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
202
|
+
Spidr.host('company.com') do |spider|
|
203
|
+
spider.all_headers do |headers|
|
204
|
+
servers << headers['server']
|
205
|
+
end
|
206
|
+
end
|
207
|
+
```
|
157
208
|
|
158
209
|
Pause the spider on a forbidden page:
|
159
210
|
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
211
|
+
```ruby
|
212
|
+
Spidr.host('company.com') do |spider|
|
213
|
+
spider.every_forbidden_page do |page|
|
214
|
+
spider.pause!
|
215
|
+
end
|
216
|
+
end
|
217
|
+
```
|
165
218
|
|
166
219
|
Skip the processing of a page:
|
167
220
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
221
|
+
```ruby
|
222
|
+
Spidr.host('company.com') do |spider|
|
223
|
+
spider.every_missing_page do |page|
|
224
|
+
spider.skip_page!
|
225
|
+
end
|
226
|
+
end
|
227
|
+
```
|
173
228
|
|
174
229
|
Skip the processing of links:
|
175
230
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
end
|
231
|
+
```ruby
|
232
|
+
Spidr.host('company.com') do |spider|
|
233
|
+
spider.every_url do |url|
|
234
|
+
if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
235
|
+
spider.skip_link!
|
182
236
|
end
|
237
|
+
end
|
238
|
+
end
|
239
|
+
```
|
183
240
|
|
184
241
|
## Requirements
|
185
242
|
|
@@ -188,11 +245,13 @@ Skip the processing of links:
|
|
188
245
|
|
189
246
|
## Install
|
190
247
|
|
191
|
-
|
248
|
+
```shell
|
249
|
+
$ gem install spidr
|
250
|
+
```
|
192
251
|
|
193
252
|
## License
|
194
253
|
|
195
|
-
Copyright (c) 2008-
|
254
|
+
Copyright (c) 2008-2022 Hal Brodigan
|
196
255
|
|
197
256
|
See {file:LICENSE.txt} for license information.
|
198
257
|
|
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
@@ -11,10 +11,17 @@ email: postmodern.mod3@gmail.com
|
|
11
11
|
homepage: https://github.com/postmodern/spidr#readme
|
12
12
|
has_yard: true
|
13
13
|
|
14
|
+
metadata:
|
15
|
+
documentation_uri: https://rubydoc.info/gems/spidr
|
16
|
+
source_code_uri: https://github.com/postmodern/spidr.rb
|
17
|
+
bug_tracker_uri: https://github.com/postmodern/spidr.rb/issues
|
18
|
+
changelog_uri: https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md
|
19
|
+
rubygems_mfa_required: 'true'
|
20
|
+
|
14
21
|
required_ruby_version: ">= 2.0.0"
|
15
22
|
|
16
23
|
dependencies:
|
17
24
|
nokogiri: ~> 1.3
|
18
25
|
|
19
26
|
development_dependencies:
|
20
|
-
bundler: ~>
|
27
|
+
bundler: ~> 2.0
|
data/lib/spidr/agent/actions.rb
CHANGED
data/lib/spidr/agent/events.rb
CHANGED
data/lib/spidr/agent/filters.rb
CHANGED
@@ -16,7 +16,7 @@ module Spidr
|
|
16
16
|
# agent.schemes = ['http']
|
17
17
|
#
|
18
18
|
def schemes=(new_schemes)
|
19
|
-
@schemes = new_schemes.map
|
19
|
+
@schemes = new_schemes.map(&:to_s)
|
20
20
|
end
|
21
21
|
|
22
22
|
#
|
@@ -356,89 +356,88 @@ module Spidr
|
|
356
356
|
#
|
357
357
|
# Initializes filtering rules.
|
358
358
|
#
|
359
|
-
# @param [
|
360
|
-
# Additional options.
|
361
|
-
#
|
362
|
-
# @option options [Array] :schemes (['http', 'https'])
|
359
|
+
# @param [Array<String>] schemes
|
363
360
|
# The list of acceptable URI schemes to visit.
|
364
361
|
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
365
362
|
#
|
366
|
-
# @
|
363
|
+
# @param [String] host
|
367
364
|
# The host-name to visit.
|
368
365
|
#
|
369
|
-
# @
|
366
|
+
# @param [Array<String, Regexp, Proc>] hosts
|
370
367
|
# The patterns which match the host-names to visit.
|
371
368
|
#
|
372
|
-
# @
|
369
|
+
# @param [Array<String, Regexp, Proc>] ignore_hosts
|
373
370
|
# The patterns which match the host-names to not visit.
|
374
371
|
#
|
375
|
-
# @
|
372
|
+
# @param [Array<Integer, Regexp, Proc>] ports
|
376
373
|
# The patterns which match the ports to visit.
|
377
374
|
#
|
378
|
-
# @
|
375
|
+
# @param [Array<Integer, Regexp, Proc>] ignore_ports
|
379
376
|
# The patterns which match the ports to not visit.
|
380
377
|
#
|
381
|
-
# @
|
378
|
+
# @param [Array<String, Regexp, Proc>] links
|
382
379
|
# The patterns which match the links to visit.
|
383
380
|
#
|
384
|
-
# @
|
381
|
+
# @param [Array<String, Regexp, Proc>] ignore_links
|
385
382
|
# The patterns which match the links to not visit.
|
386
383
|
#
|
387
|
-
# @
|
384
|
+
# @param [Array<String, Regexp, Proc>] urls
|
388
385
|
# The patterns which match the URLs to visit.
|
389
386
|
#
|
390
|
-
# @
|
387
|
+
# @param [Array<String, Regexp, Proc>] ignore_urls
|
391
388
|
# The patterns which match the URLs to not visit.
|
392
389
|
#
|
393
|
-
# @
|
390
|
+
# @param [Array<String, Regexp, Proc>] exts
|
394
391
|
# The patterns which match the URI path extensions to visit.
|
395
392
|
#
|
396
|
-
# @
|
393
|
+
# @param [Array<String, Regexp, Proc>] ignore_exts
|
397
394
|
# The patterns which match the URI path extensions to not visit.
|
398
395
|
#
|
399
|
-
def initialize_filters(
|
400
|
-
|
396
|
+
def initialize_filters(schemes: self.class.default_schemes,
|
397
|
+
host: nil,
|
398
|
+
hosts: nil,
|
399
|
+
ignore_hosts: nil,
|
400
|
+
ports: nil,
|
401
|
+
ignore_ports: nil,
|
402
|
+
links: nil,
|
403
|
+
ignore_links: nil,
|
404
|
+
urls: nil,
|
405
|
+
ignore_urls: nil,
|
406
|
+
exts: nil,
|
407
|
+
ignore_exts: nil)
|
408
|
+
@schemes = schemes.map(&:to_s)
|
409
|
+
|
410
|
+
@host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
|
411
|
+
@port_rules = Rules.new(accept: ports, reject: ignore_ports)
|
412
|
+
@link_rules = Rules.new(accept: links, reject: ignore_links)
|
413
|
+
@url_rules = Rules.new(accept: urls, reject: ignore_urls)
|
414
|
+
@ext_rules = Rules.new(accept: exts, reject: ignore_exts)
|
415
|
+
|
416
|
+
visit_hosts_like(host) if host
|
417
|
+
end
|
401
418
|
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
419
|
+
#
|
420
|
+
# Determines the default URI schemes to follow.
|
421
|
+
#
|
422
|
+
# @return [Array<String>]
|
423
|
+
# The default URI schemes to follow.
|
424
|
+
#
|
425
|
+
# @since 0.6.2
|
426
|
+
#
|
427
|
+
def self.default_schemes
|
428
|
+
schemes = ['http']
|
406
429
|
|
407
|
-
|
408
|
-
|
430
|
+
begin
|
431
|
+
require 'net/https'
|
409
432
|
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
end
|
433
|
+
schemes << 'https'
|
434
|
+
rescue Gem::LoadError => e
|
435
|
+
raise(e)
|
436
|
+
rescue ::LoadError
|
437
|
+
warn "Warning: cannot load 'net/https', https support disabled"
|
416
438
|
end
|
417
439
|
|
418
|
-
|
419
|
-
accept: options[:hosts],
|
420
|
-
reject: options[:ignore_hosts]
|
421
|
-
)
|
422
|
-
@port_rules = Rules.new(
|
423
|
-
accept: options[:ports],
|
424
|
-
reject: options[:ignore_ports]
|
425
|
-
)
|
426
|
-
@link_rules = Rules.new(
|
427
|
-
accept: options[:links],
|
428
|
-
reject: options[:ignore_links]
|
429
|
-
)
|
430
|
-
@url_rules = Rules.new(
|
431
|
-
accept: options[:urls],
|
432
|
-
reject: options[:ignore_urls]
|
433
|
-
)
|
434
|
-
@ext_rules = Rules.new(
|
435
|
-
accept: options[:exts],
|
436
|
-
reject: options[:ignore_exts]
|
437
|
-
)
|
438
|
-
|
439
|
-
if options[:host]
|
440
|
-
visit_hosts_like(options[:host])
|
441
|
-
end
|
440
|
+
return schemes
|
442
441
|
end
|
443
442
|
|
444
443
|
#
|
@@ -452,9 +451,9 @@ module Spidr
|
|
452
451
|
#
|
453
452
|
def visit_scheme?(scheme)
|
454
453
|
if scheme
|
455
|
-
|
454
|
+
@schemes.include?(scheme)
|
456
455
|
else
|
457
|
-
|
456
|
+
true
|
458
457
|
end
|
459
458
|
end
|
460
459
|
|
data/lib/spidr/agent/sanitizers.rb
CHANGED
@@ -21,7 +21,7 @@ module Spidr
|
|
21
21
|
# @since 0.2.2
|
22
22
|
#
|
23
23
|
def sanitize_url(url)
|
24
|
-
url = URI(url
|
24
|
+
url = URI(url)
|
25
25
|
|
26
26
|
url.fragment = nil if @strip_fragments
|
27
27
|
url.query = nil if @strip_query
|
@@ -34,20 +34,17 @@ module Spidr
|
|
34
34
|
#
|
35
35
|
# Initializes the Sanitizer rules.
|
36
36
|
#
|
37
|
-
# @param [
|
38
|
-
# Additional options.
|
39
|
-
#
|
40
|
-
# @option options [Boolean] :strip_fragments (true)
|
37
|
+
# @param [Boolean] strip_fragments
|
41
38
|
# Specifies whether or not to strip the fragment component from URLs.
|
42
39
|
#
|
43
|
-
# @
|
40
|
+
# @param [Boolean] strip_query
|
44
41
|
# Specifies whether or not to strip the query component from URLs.
|
45
42
|
#
|
46
43
|
# @since 0.2.2
|
47
44
|
#
|
48
|
-
def initialize_sanitizers(
|
49
|
-
@strip_fragments =
|
50
|
-
@strip_query =
|
45
|
+
def initialize_sanitizers(strip_fragments: true, strip_query: false)
|
46
|
+
@strip_fragments = strip_fragments
|
47
|
+
@strip_query = strip_query
|
51
48
|
end
|
52
49
|
|
53
50
|
end
|