spidr 0.6.1 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +19 -1
- data/Gemfile +7 -4
- data/LICENSE.txt +1 -1
- data/README.md +136 -79
- data/Rakefile +1 -0
- data/gemspec.yml +7 -0
- data/lib/spidr/agent/actions.rb +3 -1
- data/lib/spidr/agent/events.rb +3 -1
- data/lib/spidr/agent/filters.rb +57 -56
- data/lib/spidr/agent/robots.rb +2 -0
- data/lib/spidr/agent/sanitizers.rb +7 -8
- data/lib/spidr/agent.rb +232 -108
- data/lib/spidr/auth_credential.rb +2 -0
- data/lib/spidr/auth_store.rb +9 -7
- data/lib/spidr/cookie_jar.rb +7 -5
- data/lib/spidr/extensions/uri.rb +3 -1
- data/lib/spidr/extensions.rb +3 -1
- data/lib/spidr/page/content_types.rb +53 -0
- data/lib/spidr/page/cookies.rb +2 -0
- data/lib/spidr/page/html.rb +21 -20
- data/lib/spidr/page/status_codes.rb +15 -11
- data/lib/spidr/page.rb +3 -1
- data/lib/spidr/proxy.rb +8 -14
- data/lib/spidr/rules.rb +7 -8
- data/lib/spidr/session_cache.rb +26 -22
- data/lib/spidr/settings/proxy.rb +22 -6
- data/lib/spidr/settings/timeouts.rb +2 -0
- data/lib/spidr/settings/user_agent.rb +2 -0
- data/lib/spidr/settings.rb +5 -3
- data/lib/spidr/spidr.rb +22 -11
- data/lib/spidr/version.rb +3 -1
- data/lib/spidr.rb +5 -3
- data/spec/agent_spec.rb +356 -7
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- data/spidr.gemspec +1 -4
- metadata +8 -7
- data/.travis.yml +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 471764341b98b0cfeb57db24ac34a849dcfdcf43a751b648451a20c29c1ec051
+  data.tar.gz: '009c903cf30a13e55bbb8029fe2fdbfa4f8a8af32126b74aeb558f1afd3d3d88'
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bddb65750dce8f6193764ac9d372adfa1893dc8743c24c383c359069043b51cd94e09ecd8bffad16bb8b4d92f99324c98ca95f8f59a9c9655a3f2fb7c42b9f57
+  data.tar.gz: c02f98806d9297ee22c6552eaaf6bb82f619001af25b0d8eeaabf91d0e32ab7154b5436de71ed4773b15353ba5556b52ece92a6035a891eb001c27b90e5cdda5
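These digests cover the `metadata.gz` and `data.tar.gz` archives packed inside the `.gem` file. As a rough sketch (assuming the gem has been unpacked first, e.g. with `tar -xf spidr-0.7.1.gem`), they can be re-checked with Ruby's standard `digest` library:

```ruby
require 'digest'

# SHA256 of data.tar.gz as published in checksums.yaml above.
expected = '009c903cf30a13e55bbb8029fe2fdbfa4f8a8af32126b74aeb558f1afd3d3d88'
actual   = Digest::SHA256.file('data.tar.gz').hexdigest

puts(actual == expected ? 'data.tar.gz: OK' : 'data.tar.gz: MISMATCH')
```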
data/.github/workflows/ruby.yml
ADDED
@@ -0,0 +1,26 @@
+name: CI
+
+on: [ push, pull_request ]
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby:
+          - '3.0'
+          - '3.1'
+          - '3.2'
+          - '3.3'
+          - jruby
+    name: Ruby ${{ matrix.ruby }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+          bundler-cache: true
+      - name: Run tests
+        run: bundle exec rake test
data/.gitignore
CHANGED
data/ChangeLog.md
CHANGED
@@ -1,6 +1,24 @@
+### 0.7.1 / 2024-01-25
+
+* Switched to using `require_relative` to improve load-times.
+* Added `# frozen_string_literal: true` to all files.
+* Use keyword arguments for {Spidr.domain}.
+* Rescue `URI::Error` instead of `Exception` when calling `URI::HTTP#merge` in
+  {Spidr::Page#to_absolute}.
+
+### 0.7.0 / 2022-12-31
+
+* Added {Spidr.domain} and {Spidr::Agent.domain}.
+* Added {Spidr::Page#gif?}.
+* Added {Spidr::Page#jpeg?}.
+* Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
+* Added {Spidr::Page#png?}.
+* {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
+  `URI::HTTP` object.
+
 ### 0.6.1 / 2019-10-24
 
-* Check for opaque component of URIs before attempting to set the path
+* Check for the opaque component of URIs before attempting to set the path
   component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
   opaque` exceptions.
 * Fix `@robots` instance variable warning (@spk).
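The 0.7.x entries above center on the new `Spidr.domain` entry point and the `Spidr::Page` content-type helpers; a minimal sketch of both together (the URL is illustrative, not from the diff):

```ruby
require 'spidr'

# Spidr.domain (added in 0.7.0) spiders a domain and its sub-domains;
# since 0.7.1 its extra options are real keyword arguments.
Spidr.domain('ruby-lang.org') do |agent|
  agent.every_page do |page|
    # png?/jpeg?/gif?/ico? were added to Spidr::Page in 0.7.0.
    puts "image: #{page.url}" if page.png? || page.jpeg?
  end
end
```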
data/Gemfile
CHANGED
@@ -12,10 +12,13 @@ group :development do
   gem 'rake'
   gem 'rubygems-tasks', '~> 0.2'
 
-  gem 'rspec',
-  gem 'webmock',
-  gem 'sinatra',
+  gem 'rspec', '~> 3.0'
+  gem 'webmock', '~> 3.0'
+  gem 'sinatra', '~> 2.0'
+  gem 'simplecov', '~> 0.20'
 
   gem 'kramdown'
-  gem '
+  gem 'redcarpet', platform: :mri
+  gem 'yard', '~> 0.9'
+  gem 'yard-spellcheck', require: false
 end
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,11 @@
 # Spidr
 
+[![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
+
 * [Homepage](https://github.com/postmodern/spidr#readme)
 * [Source](https://github.com/postmodern/spidr)
 * [Issues](https://github.com/postmodern/spidr/issues)
 * [Mailing List](http://groups.google.com/group/spidr)
-* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
-* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
 
 ## Description
 
@@ -49,137 +49,194 @@ and easy to use.
 
 Start spidering from a URL:
 
-
+```ruby
+Spidr.start_at('http://tenderlovemaking.com/') do |agent|
+  # ...
+end
+```
 
 Spider a host:
 
-
+```ruby
+Spidr.host('solnic.eu') do |agent|
+  # ...
+end
+```
+
+Spider a domain (and any sub-domains):
+
+```ruby
+Spidr.domain('ruby-lang.org') do |agent|
+  # ...
+end
+```
 
 Spider a site:
 
-
+```ruby
+Spidr.site('http://www.rubyflow.com/') do |agent|
+  # ...
+end
+```
 
 Spider multiple hosts:
 
-
-
-
-
-
-    ]
-    )
+```ruby
+Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
+  # ...
+end
+```
 
 Do not spider certain links:
 
-
+```ruby
+Spidr.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
+  # ...
+end
+```
 
 Do not spider links on certain ports:
 
-
+```ruby
+Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
+  # ...
+end
+```
 
 Do not spider links blacklisted in robots.txt:
 
-
-
-
-
+```ruby
+Spidr.site('http://company.com/', robots: true) do |agent|
+  # ...
+end
+```
 
 Print out visited URLs:
 
-
-
-
+```ruby
+Spidr.site('http://www.rubyinside.com/') do |spider|
+  spider.every_url { |url| puts url }
+end
+```
 
 Build a URL map of a site:
 
-
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
 
-
-
-
-
-
+Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
+```
 
 Print out the URLs that could not be requested:
 
-
-
-
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_failed_url { |url| puts url }
+end
+```
 
 Finds all pages which have broken links:
 
-
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
 
-
-
-
-
-
+spider = Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
 
-
-
+spider.failures.each do |url|
+  puts "Broken link #{url} found in:"
 
-
-
+  url_map[url].each { |page| puts "  #{page}" }
+end
+```
 
 Search HTML and XML pages:
 
-
-
-
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_page do |page|
+    puts ">>> #{page.url}"
 
-
-
-
+    page.search('//meta').each do |meta|
+      name = (meta.attributes['name'] || meta.attributes['http-equiv'])
+      value = meta.attributes['content']
 
-
-      end
-    end
+      puts "  #{name} = #{value}"
     end
+  end
+end
+```
 
 Print out the titles from every page:
 
-
-
-
-
-
+```ruby
+Spidr.site('https://www.ruby-lang.org/') do |spider|
+  spider.every_html_page do |page|
+    puts page.title
+  end
+end
+```
+
+Print out every HTTP redirect:
+
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_redirect_page do |page|
+    puts "#{page.url} -> #{page.headers['Location']}"
+  end
+end
+```
 
 Find what kinds of web servers a host is using, by accessing the headers:
 
-
+```ruby
+servers = Set[]
 
-
-
-
-
-
+Spidr.host('company.com') do |spider|
+  spider.all_headers do |headers|
+    servers << headers['server']
+  end
+end
+```
 
 Pause the spider on a forbidden page:
 
-
-
-
-
-
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_forbidden_page do |page|
+    spider.pause!
+  end
+end
+```
 
 Skip the processing of a page:
 
-
-
-
-
-
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_missing_page do |page|
+    spider.skip_page!
+  end
+end
+```
 
 Skip the processing of links:
 
-
-
-
-
-
-    end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_url do |url|
+    if url.path.split('/').find { |dir| dir.to_i > 1000 }
+      spider.skip_link!
     end
+  end
+end
+```
 
 ## Requirements
 
@@ -188,12 +245,12 @@ Skip the processing of links:
 
 ## Install
 
-
+```shell
+$ gem install spidr
+```
 
 ## License
 
-Copyright (c) 2008-2016 Hal Brodigan
-
 See {file:LICENSE.txt} for license information.
 
 [ruby]: https://www.ruby-lang.org/
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
@@ -11,6 +11,13 @@ email: postmodern.mod3@gmail.com
 homepage: https://github.com/postmodern/spidr#readme
 has_yard: true
 
+metadata:
+  documentation_uri: https://rubydoc.info/gems/spidr
+  source_code_uri: https://github.com/postmodern/spidr.rb
+  bug_tracker_uri: https://github.com/postmodern/spidr.rb/issues
+  changelog_uri: https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md
+  rubygems_mfa_required: 'true'
+
 required_ruby_version: ">= 2.0.0"
 
 dependencies:
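The new `metadata` mapping is embedded in the built gem and surfaces on its `Gem::Specification`; a small sketch of reading it back from an installed copy:

```ruby
require 'rubygems'

spec = Gem::Specification.find_by_name('spidr')

# Prints documentation_uri, source_code_uri, etc. declared above.
spec.metadata.each { |key, value| puts "#{key}: #{value}" }
```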
data/lib/spidr/agent/actions.rb
CHANGED
data/lib/spidr/agent/events.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 module Spidr
   class Agent
     #
@@ -520,7 +522,7 @@ module Spidr
 
     protected
 
-    def initialize_events
+    def initialize_events
       @every_url_blocks = []
       @every_failed_url_blocks = []
       @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
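Because `initialize_events` stores its hooks in arrays, registered callbacks accumulate rather than replace one another; a short sketch (host name hypothetical):

```ruby
require 'spidr'

visited = []

Spidr.host('example.com') do |agent|
  # Both blocks are appended to @every_url_blocks, so both run per URL.
  agent.every_url { |url| puts url }
  agent.every_url { |url| visited << url }
end
```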
data/lib/spidr/agent/filters.rb
CHANGED
@@ -1,4 +1,6 @@
-require 'spidr/rules'
+# frozen_string_literal: true
+
+require_relative '../rules'
 
 module Spidr
   class Agent
@@ -170,7 +172,7 @@ module Spidr
     #
     # @yieldparam [String] link
     #   A link to accept or reject.
-    #
+    #
     # @since 0.2.4
     #
     def visit_links_like(pattern=nil,&block)
@@ -238,7 +240,7 @@ module Spidr
     #
     # @yieldparam [URI::HTTP, URI::HTTPS] url
     #   A URL to accept or reject.
-    #
+    #
     # @since 0.2.4
     #
     def visit_urls_like(pattern=nil,&block)
@@ -356,89 +358,88 @@ module Spidr
     #
     # Initializes filtering rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Array] :schemes (['http', 'https'])
+    # @param [Array<String>] schemes
     #   The list of acceptable URI schemes to visit.
     #   The `https` scheme will be ignored if `net/https` cannot be loaded.
     #
-    # @option options [String] :host
+    # @param [String] host
     #   The host-name to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :hosts
+    # @param [Array<String, Regexp, Proc>] hosts
     #   The patterns which match the host-names to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
+    # @param [Array<String, Regexp, Proc>] ignore_hosts
     #   The patterns which match the host-names to not visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ports
+    # @param [Array<Integer, Regexp, Proc>] ports
     #   The patterns which match the ports to visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
+    # @param [Array<Integer, Regexp, Proc>] ignore_ports
     #   The patterns which match the ports to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :links
+    # @param [Array<String, Regexp, Proc>] links
     #   The patterns which match the links to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_links
+    # @param [Array<String, Regexp, Proc>] ignore_links
     #   The patterns which match the links to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :urls
+    # @param [Array<String, Regexp, Proc>] urls
     #   The patterns which match the URLs to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_urls
+    # @param [Array<String, Regexp, Proc>] ignore_urls
     #   The patterns which match the URLs to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :exts
+    # @param [Array<String, Regexp, Proc>] exts
     #   The patterns which match the URI path extensions to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_exts
+    # @param [Array<String, Regexp, Proc>] ignore_exts
     #   The patterns which match the URI path extensions to not visit.
     #
-    def initialize_filters(options={})
-
+    def initialize_filters(schemes: self.class.default_schemes,
+                           host: nil,
+                           hosts: nil,
+                           ignore_hosts: nil,
+                           ports: nil,
+                           ignore_ports: nil,
+                           links: nil,
+                           ignore_links: nil,
+                           urls: nil,
+                           ignore_urls: nil,
+                           exts: nil,
+                           ignore_exts: nil)
+      @schemes = schemes.map(&:to_s)
+
+      @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
+      @port_rules = Rules.new(accept: ports, reject: ignore_ports)
+      @link_rules = Rules.new(accept: links, reject: ignore_links)
+      @url_rules = Rules.new(accept: urls, reject: ignore_urls)
+      @ext_rules = Rules.new(accept: exts, reject: ignore_exts)
+
+      visit_hosts_like(host) if host
+    end
 
-
-
-
-
+    #
+    # Determines the default URI schemes to follow.
+    #
+    # @return [Array<String>]
+    #   The default URI schemes to follow.
+    #
+    # @since 0.6.2
+    #
+    def self.default_schemes
+      schemes = ['http']
 
-
-
+      begin
+        require 'net/https'
 
-
-
-
-
-
-      end
+        schemes << 'https'
+      rescue Gem::LoadError => e
+        raise(e)
+      rescue ::LoadError
+        warn "Warning: cannot load 'net/https', https support disabled"
       end
 
-      @host_rules = Rules.new(
-        accept: options[:hosts],
-        reject: options[:ignore_hosts]
-      )
-      @port_rules = Rules.new(
-        accept: options[:ports],
-        reject: options[:ignore_ports]
-      )
-      @link_rules = Rules.new(
-        accept: options[:links],
-        reject: options[:ignore_links]
-      )
-      @url_rules = Rules.new(
-        accept: options[:urls],
-        reject: options[:ignore_urls]
-      )
-      @ext_rules = Rules.new(
-        accept: options[:exts],
-        reject: options[:ignore_exts]
-      )
-
-      if options[:host]
-        visit_hosts_like(options[:host])
-      end
+      return schemes
     end
 
     #
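The hunk above replaces the 0.6.x options hash with explicit keyword arguments; the same names are still accepted through the public entry points, as in this sketch (hosts, ports, and URL are illustrative):

```ruby
require 'spidr'

Spidr.site('http://company.com/',
           hosts:        ['company.com', /cdn\d+\.company\.com/],
           ignore_ports: [8080],
           ignore_exts:  ['zip', 'exe']) do |agent|
  agent.every_url { |url| puts url }
end
```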
data/lib/spidr/agent/robots.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'uri'
 
 module Spidr
data/lib/spidr/agent/sanitizers.rb
CHANGED
@@ -34,20 +36,17 @@ module Spidr
     #
     # Initializes the Sanitizer rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Boolean] :strip_fragments (true)
+    # @param [Boolean] strip_fragments
     #   Specifies whether or not to strip the fragment component from URLs.
     #
-    # @option options [Boolean] :strip_query (false)
+    # @param [Boolean] strip_query
     #   Specifies whether or not to strip the query component from URLs.
     #
     # @since 0.2.2
     #
-    def initialize_sanitizers(options={})
-      @strip_fragments = options.fetch(:strip_fragments,true)
-      @strip_query     = options.fetch(:strip_query,false)
+    def initialize_sanitizers(strip_fragments: true, strip_query: false)
+      @strip_fragments = strip_fragments
+      @strip_query = strip_query
     end
 
   end
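A short sketch of the sanitizer keywords defined above, passed through `Spidr.site` (URL hypothetical); `strip_fragments` defaults to `true` and `strip_query` to `false`:

```ruby
require 'spidr'

# With strip_query: true, /page?a=1 and /page collapse to the same URL
# before links are enqueued; fragments are stripped by default.
Spidr.site('http://example.com/', strip_query: true) do |agent|
  agent.every_url { |url| puts url }
end
```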