spidr 0.6.1 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +19 -1
- data/Gemfile +7 -4
- data/LICENSE.txt +1 -1
- data/README.md +136 -79
- data/Rakefile +1 -0
- data/gemspec.yml +7 -0
- data/lib/spidr/agent/actions.rb +3 -1
- data/lib/spidr/agent/events.rb +3 -1
- data/lib/spidr/agent/filters.rb +57 -56
- data/lib/spidr/agent/robots.rb +2 -0
- data/lib/spidr/agent/sanitizers.rb +7 -8
- data/lib/spidr/agent.rb +232 -108
- data/lib/spidr/auth_credential.rb +2 -0
- data/lib/spidr/auth_store.rb +9 -7
- data/lib/spidr/cookie_jar.rb +7 -5
- data/lib/spidr/extensions/uri.rb +3 -1
- data/lib/spidr/extensions.rb +3 -1
- data/lib/spidr/page/content_types.rb +53 -0
- data/lib/spidr/page/cookies.rb +2 -0
- data/lib/spidr/page/html.rb +21 -20
- data/lib/spidr/page/status_codes.rb +15 -11
- data/lib/spidr/page.rb +3 -1
- data/lib/spidr/proxy.rb +8 -14
- data/lib/spidr/rules.rb +7 -8
- data/lib/spidr/session_cache.rb +26 -22
- data/lib/spidr/settings/proxy.rb +22 -6
- data/lib/spidr/settings/timeouts.rb +2 -0
- data/lib/spidr/settings/user_agent.rb +2 -0
- data/lib/spidr/settings.rb +5 -3
- data/lib/spidr/spidr.rb +22 -11
- data/lib/spidr/version.rb +3 -1
- data/lib/spidr.rb +5 -3
- data/spec/agent_spec.rb +356 -7
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- data/spidr.gemspec +1 -4
- metadata +8 -7
- data/.travis.yml +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 471764341b98b0cfeb57db24ac34a849dcfdcf43a751b648451a20c29c1ec051
|
4
|
+
data.tar.gz: '009c903cf30a13e55bbb8029fe2fdbfa4f8a8af32126b74aeb558f1afd3d3d88'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bddb65750dce8f6193764ac9d372adfa1893dc8743c24c383c359069043b51cd94e09ecd8bffad16bb8b4d92f99324c98ca95f8f59a9c9655a3f2fb7c42b9f57
|
7
|
+
data.tar.gz: c02f98806d9297ee22c6552eaaf6bb82f619001af25b0d8eeaabf91d0e32ab7154b5436de71ed4773b15353ba5556b52ece92a6035a891eb001c27b90e5cdda5
|
data/.editorconfig
ADDED
data/.github/workflows/ruby.yml
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on: [ push, pull_request ]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
tests:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
strategy:
|
9
|
+
fail-fast: false
|
10
|
+
matrix:
|
11
|
+
ruby:
|
12
|
+
- '3.0'
|
13
|
+
- '3.1'
|
14
|
+
- '3.2'
|
15
|
+
- '3.3'
|
16
|
+
- jruby
|
17
|
+
name: Ruby ${{ matrix.ruby }}
|
18
|
+
steps:
|
19
|
+
- uses: actions/checkout@v4
|
20
|
+
- name: Set up Ruby
|
21
|
+
uses: ruby/setup-ruby@v1
|
22
|
+
with:
|
23
|
+
ruby-version: ${{ matrix.ruby }}
|
24
|
+
bundler-cache: true
|
25
|
+
- name: Run tests
|
26
|
+
run: bundle exec rake test
|
data/.gitignore
CHANGED
data/ChangeLog.md
CHANGED
@@ -1,6 +1,24 @@
|
|
1
|
+
### 0.7.1 / 2024-01-25
|
2
|
+
|
3
|
+
* Switched to using `require_relative` to improve load-times.
|
4
|
+
* Added `# frozen_string_literal: true` to all files.
|
5
|
+
* Use keyword arguments for {Spidr.domain}.
|
6
|
+
* Rescue `URI::Error` instead of `Exception` when calling `URI::HTTP#merge` in
|
7
|
+
{Spidr::Page#to_absolute}.
|
8
|
+
|
9
|
+
### 0.7.0 / 2022-12-31
|
10
|
+
|
11
|
+
* Added {Spidr.domain} and {Spidr::Agent.domain}.
|
12
|
+
* Added {Spidr::Page#gif?}.
|
13
|
+
* Added {Spidr::Page#jpeg?}.
|
14
|
+
* Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
|
15
|
+
* Added {Spidr::Page#png?}.
|
16
|
+
* {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
|
17
|
+
`URI::HTTP` object.
|
18
|
+
|
1
19
|
### 0.6.1 / 2019-10-24
|
2
20
|
|
3
|
-
* Check for opaque component of URIs before attempting to set the path
|
21
|
+
* Check for the opaque component of URIs before attempting to set the path
|
4
22
|
component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
|
5
23
|
opaque` exceptions.
|
6
24
|
* Fix `@robots` instance variable warning (@spk).
|
data/Gemfile
CHANGED
@@ -12,10 +12,13 @@ group :development do
|
|
12
12
|
gem 'rake'
|
13
13
|
gem 'rubygems-tasks', '~> 0.2'
|
14
14
|
|
15
|
-
gem 'rspec',
|
16
|
-
gem 'webmock',
|
17
|
-
gem 'sinatra',
|
15
|
+
gem 'rspec', '~> 3.0'
|
16
|
+
gem 'webmock', '~> 3.0'
|
17
|
+
gem 'sinatra', '~> 2.0'
|
18
|
+
gem 'simplecov', '~> 0.20'
|
18
19
|
|
19
20
|
gem 'kramdown'
|
20
|
-
gem '
|
21
|
+
gem 'redcarpet', platform: :mri
|
22
|
+
gem 'yard', '~> 0.9'
|
23
|
+
gem 'yard-spellcheck', require: false
|
21
24
|
end
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# Spidr
|
2
2
|
|
3
|
+
[![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
|
4
|
+
|
3
5
|
* [Homepage](https://github.com/postmodern/spidr#readme)
|
4
6
|
* [Source](https://github.com/postmodern/spidr)
|
5
7
|
* [Issues](https://github.com/postmodern/spidr/issues)
|
6
8
|
* [Mailing List](http://groups.google.com/group/spidr)
|
7
|
-
* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
|
8
|
-
* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
|
9
9
|
|
10
10
|
## Description
|
11
11
|
|
@@ -49,137 +49,194 @@ and easy to use.
|
|
49
49
|
|
50
50
|
Start spidering from a URL:
|
51
51
|
|
52
|
-
|
52
|
+
```ruby
|
53
|
+
Spidr.start_at('http://tenderlovemaking.com/') do |agent|
|
54
|
+
# ...
|
55
|
+
end
|
56
|
+
```
|
53
57
|
|
54
58
|
Spider a host:
|
55
59
|
|
56
|
-
|
60
|
+
```ruby
|
61
|
+
Spidr.host('solnic.eu') do |agent|
|
62
|
+
# ...
|
63
|
+
end
|
64
|
+
```
|
65
|
+
|
66
|
+
Spider a domain (and any sub-domains):
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
Spidr.domain('ruby-lang.org') do |agent|
|
70
|
+
# ...
|
71
|
+
end
|
72
|
+
```
|
57
73
|
|
58
74
|
Spider a site:
|
59
75
|
|
60
|
-
|
76
|
+
```ruby
|
77
|
+
Spidr.site('http://www.rubyflow.com/') do |agent|
|
78
|
+
# ...
|
79
|
+
end
|
80
|
+
```
|
61
81
|
|
62
82
|
Spider multiple hosts:
|
63
83
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
]
|
70
|
-
)
|
84
|
+
```ruby
|
85
|
+
Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
86
|
+
# ...
|
87
|
+
end
|
88
|
+
```
|
71
89
|
|
72
90
|
Do not spider certain links:
|
73
91
|
|
74
|
-
|
92
|
+
```ruby
|
93
|
+
Spidr.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
|
94
|
+
# ...
|
95
|
+
end
|
96
|
+
```
|
75
97
|
|
76
98
|
Do not spider links on certain ports:
|
77
99
|
|
78
|
-
|
100
|
+
```ruby
|
101
|
+
Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
102
|
+
# ...
|
103
|
+
end
|
104
|
+
```
|
79
105
|
|
80
106
|
Do not spider links blacklisted in robots.txt:
|
81
107
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
108
|
+
```ruby
|
109
|
+
Spidr.site('http://company.com/', robots: true) do |agent|
|
110
|
+
# ...
|
111
|
+
end
|
112
|
+
```
|
86
113
|
|
87
114
|
Print out visited URLs:
|
88
115
|
|
89
|
-
|
90
|
-
|
91
|
-
|
116
|
+
```ruby
|
117
|
+
Spidr.site('http://www.rubyinside.com/') do |spider|
|
118
|
+
spider.every_url { |url| puts url }
|
119
|
+
end
|
120
|
+
```
|
92
121
|
|
93
122
|
Build a URL map of a site:
|
94
123
|
|
95
|
-
|
124
|
+
```ruby
|
125
|
+
url_map = Hash.new { |hash,key| hash[key] = [] }
|
96
126
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
127
|
+
Spidr.site('http://intranet.com/') do |spider|
|
128
|
+
spider.every_link do |origin,dest|
|
129
|
+
url_map[dest] << origin
|
130
|
+
end
|
131
|
+
end
|
132
|
+
```
|
102
133
|
|
103
134
|
Print out the URLs that could not be requested:
|
104
135
|
|
105
|
-
|
106
|
-
|
107
|
-
|
136
|
+
```ruby
|
137
|
+
Spidr.site('http://company.com/') do |spider|
|
138
|
+
spider.every_failed_url { |url| puts url }
|
139
|
+
end
|
140
|
+
```
|
108
141
|
|
109
142
|
Finds all pages which have broken links:
|
110
143
|
|
111
|
-
|
144
|
+
```ruby
|
145
|
+
url_map = Hash.new { |hash,key| hash[key] = [] }
|
112
146
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
147
|
+
spider = Spidr.site('http://intranet.com/') do |spider|
|
148
|
+
spider.every_link do |origin,dest|
|
149
|
+
url_map[dest] << origin
|
150
|
+
end
|
151
|
+
end
|
118
152
|
|
119
|
-
|
120
|
-
|
153
|
+
spider.failures.each do |url|
|
154
|
+
puts "Broken link #{url} found in:"
|
121
155
|
|
122
|
-
|
123
|
-
|
156
|
+
url_map[url].each { |page| puts " #{page}" }
|
157
|
+
end
|
158
|
+
```
|
124
159
|
|
125
160
|
Search HTML and XML pages:
|
126
161
|
|
127
|
-
|
128
|
-
|
129
|
-
|
162
|
+
```ruby
|
163
|
+
Spidr.site('http://company.com/') do |spider|
|
164
|
+
spider.every_page do |page|
|
165
|
+
puts ">>> #{page.url}"
|
130
166
|
|
131
|
-
|
132
|
-
|
133
|
-
|
167
|
+
page.search('//meta').each do |meta|
|
168
|
+
name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
169
|
+
value = meta.attributes['content']
|
134
170
|
|
135
|
-
|
136
|
-
end
|
137
|
-
end
|
171
|
+
puts " #{name} = #{value}"
|
138
172
|
end
|
173
|
+
end
|
174
|
+
end
|
175
|
+
```
|
139
176
|
|
140
177
|
Print out the titles from every page:
|
141
178
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
179
|
+
```ruby
|
180
|
+
Spidr.site('https://www.ruby-lang.org/') do |spider|
|
181
|
+
spider.every_html_page do |page|
|
182
|
+
puts page.title
|
183
|
+
end
|
184
|
+
end
|
185
|
+
```
|
186
|
+
|
187
|
+
Print out every HTTP redirect:
|
188
|
+
|
189
|
+
```ruby
|
190
|
+
Spidr.host('company.com') do |spider|
|
191
|
+
spider.every_redirect_page do |page|
|
192
|
+
puts "#{page.url} -> #{page.headers['Location']}"
|
193
|
+
end
|
194
|
+
end
|
195
|
+
```
|
147
196
|
|
148
197
|
Find what kinds of web servers a host is using, by accessing the headers:
|
149
198
|
|
150
|
-
|
199
|
+
```ruby
|
200
|
+
servers = Set[]
|
151
201
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
202
|
+
Spidr.host('company.com') do |spider|
|
203
|
+
spider.all_headers do |headers|
|
204
|
+
servers << headers['server']
|
205
|
+
end
|
206
|
+
end
|
207
|
+
```
|
157
208
|
|
158
209
|
Pause the spider on a forbidden page:
|
159
210
|
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
211
|
+
```ruby
|
212
|
+
Spidr.host('company.com') do |spider|
|
213
|
+
spider.every_forbidden_page do |page|
|
214
|
+
spider.pause!
|
215
|
+
end
|
216
|
+
end
|
217
|
+
```
|
165
218
|
|
166
219
|
Skip the processing of a page:
|
167
220
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
221
|
+
```ruby
|
222
|
+
Spidr.host('company.com') do |spider|
|
223
|
+
spider.every_missing_page do |page|
|
224
|
+
spider.skip_page!
|
225
|
+
end
|
226
|
+
end
|
227
|
+
```
|
173
228
|
|
174
229
|
Skip the processing of links:
|
175
230
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
end
|
231
|
+
```ruby
|
232
|
+
Spidr.host('company.com') do |spider|
|
233
|
+
spider.every_url do |url|
|
234
|
+
if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
235
|
+
spider.skip_link!
|
182
236
|
end
|
237
|
+
end
|
238
|
+
end
|
239
|
+
```
|
183
240
|
|
184
241
|
## Requirements
|
185
242
|
|
@@ -188,12 +245,12 @@ Skip the processing of links:
|
|
188
245
|
|
189
246
|
## Install
|
190
247
|
|
191
|
-
|
248
|
+
```shell
|
249
|
+
$ gem install spidr
|
250
|
+
```
|
192
251
|
|
193
252
|
## License
|
194
253
|
|
195
|
-
Copyright (c) 2008-2016 Hal Brodigan
|
196
|
-
|
197
254
|
See {file:LICENSE.txt} for license information.
|
198
255
|
|
199
256
|
[ruby]: https://www.ruby-lang.org/
|
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
@@ -11,6 +11,13 @@ email: postmodern.mod3@gmail.com
|
|
11
11
|
homepage: https://github.com/postmodern/spidr#readme
|
12
12
|
has_yard: true
|
13
13
|
|
14
|
+
metadata:
|
15
|
+
documentation_uri: https://rubydoc.info/gems/spidr
|
16
|
+
source_code_uri: https://github.com/postmodern/spidr.rb
|
17
|
+
bug_tracker_uri: https://github.com/postmodern/spidr.rb/issues
|
18
|
+
changelog_uri: https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md
|
19
|
+
rubygems_mfa_required: 'true'
|
20
|
+
|
14
21
|
required_ruby_version: ">= 2.0.0"
|
15
22
|
|
16
23
|
dependencies:
|
data/lib/spidr/agent/actions.rb
CHANGED
data/lib/spidr/agent/events.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Spidr
|
2
4
|
class Agent
|
3
5
|
#
|
@@ -520,7 +522,7 @@ module Spidr
|
|
520
522
|
|
521
523
|
protected
|
522
524
|
|
523
|
-
def initialize_events
|
525
|
+
def initialize_events
|
524
526
|
@every_url_blocks = []
|
525
527
|
@every_failed_url_blocks = []
|
526
528
|
@every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
data/lib/spidr/agent/filters.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../rules'
|
2
4
|
|
3
5
|
module Spidr
|
4
6
|
class Agent
|
@@ -170,7 +172,7 @@ module Spidr
|
|
170
172
|
#
|
171
173
|
# @yieldparam [String] link
|
172
174
|
# A link to accept or reject.
|
173
|
-
#
|
175
|
+
#
|
174
176
|
# @since 0.2.4
|
175
177
|
#
|
176
178
|
def visit_links_like(pattern=nil,&block)
|
@@ -238,7 +240,7 @@ module Spidr
|
|
238
240
|
#
|
239
241
|
# @yieldparam [URI::HTTP, URI::HTTPS] url
|
240
242
|
# A URL to accept or reject.
|
241
|
-
#
|
243
|
+
#
|
242
244
|
# @since 0.2.4
|
243
245
|
#
|
244
246
|
def visit_urls_like(pattern=nil,&block)
|
@@ -356,89 +358,88 @@ module Spidr
|
|
356
358
|
#
|
357
359
|
# Initializes filtering rules.
|
358
360
|
#
|
359
|
-
# @param [
|
360
|
-
# Additional options.
|
361
|
-
#
|
362
|
-
# @option options [Array] :schemes (['http', 'https'])
|
361
|
+
# @param [Array<String>] schemes
|
363
362
|
# The list of acceptable URI schemes to visit.
|
364
363
|
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
365
364
|
#
|
366
|
-
# @
|
365
|
+
# @param [String] host
|
367
366
|
# The host-name to visit.
|
368
367
|
#
|
369
|
-
# @
|
368
|
+
# @param [Array<String, Regexp, Proc>] hosts
|
370
369
|
# The patterns which match the host-names to visit.
|
371
370
|
#
|
372
|
-
# @
|
371
|
+
# @param [Array<String, Regexp, Proc>] ignore_hosts
|
373
372
|
# The patterns which match the host-names to not visit.
|
374
373
|
#
|
375
|
-
# @
|
374
|
+
# @param [Array<Integer, Regexp, Proc>] ports
|
376
375
|
# The patterns which match the ports to visit.
|
377
376
|
#
|
378
|
-
# @
|
377
|
+
# @param [Array<Integer, Regexp, Proc>] ignore_ports
|
379
378
|
# The patterns which match the ports to not visit.
|
380
379
|
#
|
381
|
-
# @
|
380
|
+
# @param [Array<String, Regexp, Proc>] links
|
382
381
|
# The patterns which match the links to visit.
|
383
382
|
#
|
384
|
-
# @
|
383
|
+
# @param [Array<String, Regexp, Proc>] ignore_links
|
385
384
|
# The patterns which match the links to not visit.
|
386
385
|
#
|
387
|
-
# @
|
386
|
+
# @param [Array<String, Regexp, Proc>] urls
|
388
387
|
# The patterns which match the URLs to visit.
|
389
388
|
#
|
390
|
-
# @
|
389
|
+
# @param [Array<String, Regexp, Proc>] ignore_urls
|
391
390
|
# The patterns which match the URLs to not visit.
|
392
391
|
#
|
393
|
-
# @
|
392
|
+
# @param [Array<String, Regexp, Proc>] exts
|
394
393
|
# The patterns which match the URI path extensions to visit.
|
395
394
|
#
|
396
|
-
# @
|
395
|
+
# @param [Array<String, Regexp, Proc>] ignore_exts
|
397
396
|
# The patterns which match the URI path extensions to not visit.
|
398
397
|
#
|
399
|
-
def initialize_filters(
|
400
|
-
|
398
|
+
def initialize_filters(schemes: self.class.default_schemes,
|
399
|
+
host: nil,
|
400
|
+
hosts: nil,
|
401
|
+
ignore_hosts: nil,
|
402
|
+
ports: nil,
|
403
|
+
ignore_ports: nil,
|
404
|
+
links: nil,
|
405
|
+
ignore_links: nil,
|
406
|
+
urls: nil,
|
407
|
+
ignore_urls: nil,
|
408
|
+
exts: nil,
|
409
|
+
ignore_exts: nil)
|
410
|
+
@schemes = schemes.map(&:to_s)
|
411
|
+
|
412
|
+
@host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
|
413
|
+
@port_rules = Rules.new(accept: ports, reject: ignore_ports)
|
414
|
+
@link_rules = Rules.new(accept: links, reject: ignore_links)
|
415
|
+
@url_rules = Rules.new(accept: urls, reject: ignore_urls)
|
416
|
+
@ext_rules = Rules.new(accept: exts, reject: ignore_exts)
|
417
|
+
|
418
|
+
visit_hosts_like(host) if host
|
419
|
+
end
|
401
420
|
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
421
|
+
#
|
422
|
+
# Determines the default URI schemes to follow.
|
423
|
+
#
|
424
|
+
# @return [Array<String>]
|
425
|
+
# The default URI schemes to follow.
|
426
|
+
#
|
427
|
+
# @since 0.6.2
|
428
|
+
#
|
429
|
+
def self.default_schemes
|
430
|
+
schemes = ['http']
|
406
431
|
|
407
|
-
|
408
|
-
|
432
|
+
begin
|
433
|
+
require 'net/https'
|
409
434
|
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
end
|
435
|
+
schemes << 'https'
|
436
|
+
rescue Gem::LoadError => e
|
437
|
+
raise(e)
|
438
|
+
rescue ::LoadError
|
439
|
+
warn "Warning: cannot load 'net/https', https support disabled"
|
416
440
|
end
|
417
441
|
|
418
|
-
|
419
|
-
accept: options[:hosts],
|
420
|
-
reject: options[:ignore_hosts]
|
421
|
-
)
|
422
|
-
@port_rules = Rules.new(
|
423
|
-
accept: options[:ports],
|
424
|
-
reject: options[:ignore_ports]
|
425
|
-
)
|
426
|
-
@link_rules = Rules.new(
|
427
|
-
accept: options[:links],
|
428
|
-
reject: options[:ignore_links]
|
429
|
-
)
|
430
|
-
@url_rules = Rules.new(
|
431
|
-
accept: options[:urls],
|
432
|
-
reject: options[:ignore_urls]
|
433
|
-
)
|
434
|
-
@ext_rules = Rules.new(
|
435
|
-
accept: options[:exts],
|
436
|
-
reject: options[:ignore_exts]
|
437
|
-
)
|
438
|
-
|
439
|
-
if options[:host]
|
440
|
-
visit_hosts_like(options[:host])
|
441
|
-
end
|
442
|
+
return schemes
|
442
443
|
end
|
443
444
|
|
444
445
|
#
|
data/lib/spidr/agent/robots.rb
CHANGED
data/lib/spidr/agent/sanitizers.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'uri'
|
2
4
|
|
3
5
|
module Spidr
|
@@ -34,20 +36,17 @@ module Spidr
|
|
34
36
|
#
|
35
37
|
# Initializes the Sanitizer rules.
|
36
38
|
#
|
37
|
-
# @param [
|
38
|
-
# Additional options.
|
39
|
-
#
|
40
|
-
# @option options [Boolean] :strip_fragments (true)
|
39
|
+
# @param [Boolean] strip_fragments
|
41
40
|
# Specifies whether or not to strip the fragment component from URLs.
|
42
41
|
#
|
43
|
-
# @
|
42
|
+
# @param [Boolean] strip_query
|
44
43
|
# Specifies whether or not to strip the query component from URLs.
|
45
44
|
#
|
46
45
|
# @since 0.2.2
|
47
46
|
#
|
48
|
-
def initialize_sanitizers(
|
49
|
-
@strip_fragments =
|
50
|
-
@strip_query =
|
47
|
+
def initialize_sanitizers(strip_fragments: true, strip_query: false)
|
48
|
+
@strip_fragments = strip_fragments
|
49
|
+
@strip_query = strip_query
|
51
50
|
end
|
52
51
|
|
53
52
|
end
|