spidr 0.6.1 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.editorconfig +11 -0
  3. data/.github/workflows/ruby.yml +26 -0
  4. data/.gitignore +4 -5
  5. data/ChangeLog.md +19 -1
  6. data/Gemfile +7 -4
  7. data/LICENSE.txt +1 -1
  8. data/README.md +136 -79
  9. data/Rakefile +1 -0
  10. data/gemspec.yml +7 -0
  11. data/lib/spidr/agent/actions.rb +3 -1
  12. data/lib/spidr/agent/events.rb +3 -1
  13. data/lib/spidr/agent/filters.rb +57 -56
  14. data/lib/spidr/agent/robots.rb +2 -0
  15. data/lib/spidr/agent/sanitizers.rb +7 -8
  16. data/lib/spidr/agent.rb +232 -108
  17. data/lib/spidr/auth_credential.rb +2 -0
  18. data/lib/spidr/auth_store.rb +9 -7
  19. data/lib/spidr/cookie_jar.rb +7 -5
  20. data/lib/spidr/extensions/uri.rb +3 -1
  21. data/lib/spidr/extensions.rb +3 -1
  22. data/lib/spidr/page/content_types.rb +53 -0
  23. data/lib/spidr/page/cookies.rb +2 -0
  24. data/lib/spidr/page/html.rb +21 -20
  25. data/lib/spidr/page/status_codes.rb +15 -11
  26. data/lib/spidr/page.rb +3 -1
  27. data/lib/spidr/proxy.rb +8 -14
  28. data/lib/spidr/rules.rb +7 -8
  29. data/lib/spidr/session_cache.rb +26 -22
  30. data/lib/spidr/settings/proxy.rb +22 -6
  31. data/lib/spidr/settings/timeouts.rb +2 -0
  32. data/lib/spidr/settings/user_agent.rb +2 -0
  33. data/lib/spidr/settings.rb +5 -3
  34. data/lib/spidr/spidr.rb +22 -11
  35. data/lib/spidr/version.rb +3 -1
  36. data/lib/spidr.rb +5 -3
  37. data/spec/agent_spec.rb +356 -7
  38. data/spec/example_page.rb +2 -0
  39. data/spec/page/content_types_spec.rb +22 -0
  40. data/spec/page/html_spec.rb +255 -51
  41. data/spec/page/status_codes_spec.rb +4 -4
  42. data/spec/proxy_spec.rb +2 -2
  43. data/spec/settings/proxy_examples.rb +31 -11
  44. data/spec/spec_helper.rb +3 -0
  45. data/spidr.gemspec +1 -4
  46. metadata +8 -7
  47. data/.travis.yml +0 -16
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e2202e0ce389cbbc6f88360d7e7b430328bc6da973b69929ebc54b1d92c104bb
4
- data.tar.gz: 0777e972ef2cb1d540ee138a7703ee67f88b482260074c13e8c08a8da963aa77
3
+ metadata.gz: 471764341b98b0cfeb57db24ac34a849dcfdcf43a751b648451a20c29c1ec051
4
+ data.tar.gz: '009c903cf30a13e55bbb8029fe2fdbfa4f8a8af32126b74aeb558f1afd3d3d88'
5
5
  SHA512:
6
- metadata.gz: 45e923ad3aa59812de4af67cc3a1739d7751749b3aa3205c5979e1f29c302abd43316cd74ad7c61befda2d31b0ff080872047e9713537d3bfe2509a8d555156a
7
- data.tar.gz: deae3dbd7d9566723ca8760064c715de8795b1d4e4be1ae0866497d697a411af3e8bbe2a677028a5bf57bba8e681bd9d712cef3a2d0d123bc903af28f5e32a77
6
+ metadata.gz: bddb65750dce8f6193764ac9d372adfa1893dc8743c24c383c359069043b51cd94e09ecd8bffad16bb8b4d92f99324c98ca95f8f59a9c9655a3f2fb7c42b9f57
7
+ data.tar.gz: c02f98806d9297ee22c6552eaaf6bb82f619001af25b0d8eeaabf91d0e32ab7154b5436de71ed4773b15353ba5556b52ece92a6035a891eb001c27b90e5cdda5
data/.editorconfig ADDED
@@ -0,0 +1,11 @@
1
+ root = true
2
+
3
+ [*]
4
+ end_of_line = lf
5
+ insert_final_newline = true
6
+ tab_width = 8
7
+ trim_trailing_whitespace = true
8
+
9
+ [{Gemfile,Rakefile,*.rb,*.gemspec,*.yml}]
10
+ indent_style = space
11
+ indent_size = 2
data/.github/workflows/ruby.yml ADDED
@@ -0,0 +1,26 @@
1
+ name: CI
2
+
3
+ on: [ push, pull_request ]
4
+
5
+ jobs:
6
+ tests:
7
+ runs-on: ubuntu-latest
8
+ strategy:
9
+ fail-fast: false
10
+ matrix:
11
+ ruby:
12
+ - '3.0'
13
+ - '3.1'
14
+ - '3.2'
15
+ - '3.3'
16
+ - jruby
17
+ name: Ruby ${{ matrix.ruby }}
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ - name: Set up Ruby
21
+ uses: ruby/setup-ruby@v1
22
+ with:
23
+ ruby-version: ${{ matrix.ruby }}
24
+ bundler-cache: true
25
+ - name: Run tests
26
+ run: bundle exec rake test
data/.gitignore CHANGED
@@ -1,8 +1,7 @@
1
- pkg
2
- doc
3
- web
4
- tmp
5
- Gemfile.lock
1
+ /Gemfile.lock
2
+ /coverage
3
+ /doc
4
+ /pkg
6
5
  .DS_Store
7
6
  .bundle
8
7
  .yardoc
data/ChangeLog.md CHANGED
@@ -1,6 +1,24 @@
1
+ ### 0.7.1 / 2024-01-25
2
+
3
+ * Switched to using `require_relative` to improve load-times.
4
+ * Added `# frozen_string_literal: true` to all files.
5
+ * Use keyword arguments for {Spidr.domain}.
6
+ * Rescue `URI::Error` instead of `Exception` when calling `URI::HTTP#merge` in
7
+ {Spidr::Page#to_absolute}.
8
+
9
+ ### 0.7.0 / 2022-12-31
10
+
11
+ * Added {Spidr.domain} and {Spidr::Agent.domain}.
12
+ * Added {Spidr::Page#gif?}.
13
+ * Added {Spidr::Page#jpeg?}.
14
+ * Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
15
+ * Added {Spidr::Page#png?}.
16
+ * {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
17
+ `URI::HTTP` object.
18
+
1
19
  ### 0.6.1 / 2019-10-24
2
20
 
3
- * Check for opaque component of URIs before attempting to set the path
21
+ * Check for the opaque component of URIs before attempting to set the path
4
22
  component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
5
23
  opaque` exceptions.
6
24
  * Fix `@robots` instance variable warning (@spk).
data/Gemfile CHANGED
@@ -12,10 +12,13 @@ group :development do
12
12
  gem 'rake'
13
13
  gem 'rubygems-tasks', '~> 0.2'
14
14
 
15
- gem 'rspec', '~> 3.0'
16
- gem 'webmock', '~> 3.0'
17
- gem 'sinatra', '~> 1.0'
15
+ gem 'rspec', '~> 3.0'
16
+ gem 'webmock', '~> 3.0'
17
+ gem 'sinatra', '~> 2.0'
18
+ gem 'simplecov', '~> 0.20'
18
19
 
19
20
  gem 'kramdown'
20
- gem 'yard', '~> 0.9'
21
+ gem 'redcarpet', platform: :mri
22
+ gem 'yard', '~> 0.9'
23
+ gem 'yard-spellcheck', require: false
21
24
  end
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2008-2016 Hal Brodigan
1
+ Copyright (c) 2008-2024 Hal Brodigan
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,11 +1,11 @@
1
1
  # Spidr
2
2
 
3
+ [![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
4
+
3
5
  * [Homepage](https://github.com/postmodern/spidr#readme)
4
6
  * [Source](https://github.com/postmodern/spidr)
5
7
  * [Issues](https://github.com/postmodern/spidr/issues)
6
8
  * [Mailing List](http://groups.google.com/group/spidr)
7
- * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
8
- * [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
9
9
 
10
10
  ## Description
11
11
 
@@ -49,137 +49,194 @@ and easy to use.
49
49
 
50
50
  Start spidering from a URL:
51
51
 
52
- Spidr.start_at('http://tenderlovemaking.com/')
52
+ ```ruby
53
+ Spidr.start_at('http://tenderlovemaking.com/') do |agent|
54
+ # ...
55
+ end
56
+ ```
53
57
 
54
58
  Spider a host:
55
59
 
56
- Spidr.host('solnic.eu')
60
+ ```ruby
61
+ Spidr.host('solnic.eu') do |agent|
62
+ # ...
63
+ end
64
+ ```
65
+
66
+ Spider a domain (and any sub-domains):
67
+
68
+ ```ruby
69
+ Spidr.domain('ruby-lang.org') do |agent|
70
+ # ...
71
+ end
72
+ ```
57
73
 
58
74
  Spider a site:
59
75
 
60
- Spidr.site('http://www.rubyflow.com/')
76
+ ```ruby
77
+ Spidr.site('http://www.rubyflow.com/') do |agent|
78
+ # ...
79
+ end
80
+ ```
61
81
 
62
82
  Spider multiple hosts:
63
83
 
64
- Spidr.start_at(
65
- 'http://company.com/',
66
- hosts: [
67
- 'company.com',
68
- /host[\d]+\.company\.com/
69
- ]
70
- )
84
+ ```ruby
85
+ Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
86
+ # ...
87
+ end
88
+ ```
71
89
 
72
90
  Do not spider certain links:
73
91
 
74
- Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])
92
+ ```ruby
93
+ Spidr.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
94
+ # ...
95
+ end
96
+ ```
75
97
 
76
98
  Do not spider links on certain ports:
77
99
 
78
- Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
100
+ ```ruby
101
+ Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
102
+ # ...
103
+ end
104
+ ```
79
105
 
80
106
  Do not spider links blacklisted in robots.txt:
81
107
 
82
- Spidr.site(
83
- 'http://company.com/',
84
- robots: true
85
- )
108
+ ```ruby
109
+ Spidr.site('http://company.com/', robots: true) do |agent|
110
+ # ...
111
+ end
112
+ ```
86
113
 
87
114
  Print out visited URLs:
88
115
 
89
- Spidr.site('http://www.rubyinside.com/') do |spider|
90
- spider.every_url { |url| puts url }
91
- end
116
+ ```ruby
117
+ Spidr.site('http://www.rubyinside.com/') do |spider|
118
+ spider.every_url { |url| puts url }
119
+ end
120
+ ```
92
121
 
93
122
  Build a URL map of a site:
94
123
 
95
- url_map = Hash.new { |hash,key| hash[key] = [] }
124
+ ```ruby
125
+ url_map = Hash.new { |hash,key| hash[key] = [] }
96
126
 
97
- Spidr.site('http://intranet.com/') do |spider|
98
- spider.every_link do |origin,dest|
99
- url_map[dest] << origin
100
- end
101
- end
127
+ Spidr.site('http://intranet.com/') do |spider|
128
+ spider.every_link do |origin,dest|
129
+ url_map[dest] << origin
130
+ end
131
+ end
132
+ ```
102
133
 
103
134
  Print out the URLs that could not be requested:
104
135
 
105
- Spidr.site('http://company.com/') do |spider|
106
- spider.every_failed_url { |url| puts url }
107
- end
136
+ ```ruby
137
+ Spidr.site('http://company.com/') do |spider|
138
+ spider.every_failed_url { |url| puts url }
139
+ end
140
+ ```
108
141
 
109
142
  Finds all pages which have broken links:
110
143
 
111
- url_map = Hash.new { |hash,key| hash[key] = [] }
144
+ ```ruby
145
+ url_map = Hash.new { |hash,key| hash[key] = [] }
112
146
 
113
- spider = Spidr.site('http://intranet.com/') do |spider|
114
- spider.every_link do |origin,dest|
115
- url_map[dest] << origin
116
- end
117
- end
147
+ spider = Spidr.site('http://intranet.com/') do |spider|
148
+ spider.every_link do |origin,dest|
149
+ url_map[dest] << origin
150
+ end
151
+ end
118
152
 
119
- spider.failures.each do |url|
120
- puts "Broken link #{url} found in:"
153
+ spider.failures.each do |url|
154
+ puts "Broken link #{url} found in:"
121
155
 
122
- url_map[url].each { |page| puts " #{page}" }
123
- end
156
+ url_map[url].each { |page| puts " #{page}" }
157
+ end
158
+ ```
124
159
 
125
160
  Search HTML and XML pages:
126
161
 
127
- Spidr.site('http://company.com/') do |spider|
128
- spider.every_page do |page|
129
- puts ">>> #{page.url}"
162
+ ```ruby
163
+ Spidr.site('http://company.com/') do |spider|
164
+ spider.every_page do |page|
165
+ puts ">>> #{page.url}"
130
166
 
131
- page.search('//meta').each do |meta|
132
- name = (meta.attributes['name'] || meta.attributes['http-equiv'])
133
- value = meta.attributes['content']
167
+ page.search('//meta').each do |meta|
168
+ name = (meta.attributes['name'] || meta.attributes['http-equiv'])
169
+ value = meta.attributes['content']
134
170
 
135
- puts " #{name} = #{value}"
136
- end
137
- end
171
+ puts " #{name} = #{value}"
138
172
  end
173
+ end
174
+ end
175
+ ```
139
176
 
140
177
  Print out the titles from every page:
141
178
 
142
- Spidr.site('https://www.ruby-lang.org/') do |spider|
143
- spider.every_html_page do |page|
144
- puts page.title
145
- end
146
- end
179
+ ```ruby
180
+ Spidr.site('https://www.ruby-lang.org/') do |spider|
181
+ spider.every_html_page do |page|
182
+ puts page.title
183
+ end
184
+ end
185
+ ```
186
+
187
+ Print out every HTTP redirect:
188
+
189
+ ```ruby
190
+ Spidr.host('company.com') do |spider|
191
+ spider.every_redirect_page do |page|
192
+ puts "#{page.url} -> #{page.headers['Location']}"
193
+ end
194
+ end
195
+ ```
147
196
 
148
197
  Find what kinds of web servers a host is using, by accessing the headers:
149
198
 
150
- servers = Set[]
199
+ ```ruby
200
+ servers = Set[]
151
201
 
152
- Spidr.host('company.com') do |spider|
153
- spider.all_headers do |headers|
154
- servers << headers['server']
155
- end
156
- end
202
+ Spidr.host('company.com') do |spider|
203
+ spider.all_headers do |headers|
204
+ servers << headers['server']
205
+ end
206
+ end
207
+ ```
157
208
 
158
209
  Pause the spider on a forbidden page:
159
210
 
160
- Spidr.host('company.com') do |spider|
161
- spider.every_forbidden_page do |page|
162
- spider.pause!
163
- end
164
- end
211
+ ```ruby
212
+ Spidr.host('company.com') do |spider|
213
+ spider.every_forbidden_page do |page|
214
+ spider.pause!
215
+ end
216
+ end
217
+ ```
165
218
 
166
219
  Skip the processing of a page:
167
220
 
168
- Spidr.host('company.com') do |spider|
169
- spider.every_missing_page do |page|
170
- spider.skip_page!
171
- end
172
- end
221
+ ```ruby
222
+ Spidr.host('company.com') do |spider|
223
+ spider.every_missing_page do |page|
224
+ spider.skip_page!
225
+ end
226
+ end
227
+ ```
173
228
 
174
229
  Skip the processing of links:
175
230
 
176
- Spidr.host('company.com') do |spider|
177
- spider.every_url do |url|
178
- if url.path.split('/').find { |dir| dir.to_i > 1000 }
179
- spider.skip_link!
180
- end
181
- end
231
+ ```ruby
232
+ Spidr.host('company.com') do |spider|
233
+ spider.every_url do |url|
234
+ if url.path.split('/').find { |dir| dir.to_i > 1000 }
235
+ spider.skip_link!
182
236
  end
237
+ end
238
+ end
239
+ ```
183
240
 
184
241
  ## Requirements
185
242
 
@@ -188,12 +245,12 @@ Skip the processing of links:
188
245
 
189
246
  ## Install
190
247
 
191
- $ gem install spidr
248
+ ```shell
249
+ $ gem install spidr
250
+ ```
192
251
 
193
252
  ## License
194
253
 
195
- Copyright (c) 2008-2016 Hal Brodigan
196
-
197
254
  See {file:LICENSE.txt} for license information.
198
255
 
199
256
  [ruby]: https://www.ruby-lang.org/
data/Rakefile CHANGED
@@ -12,6 +12,7 @@ Gem::Tasks.new
12
12
 
13
13
  require 'rspec/core/rake_task'
14
14
  RSpec::Core::RakeTask.new
15
+ task :test => :spec
15
16
  task :default => :spec
16
17
 
17
18
  require 'yard'
data/gemspec.yml CHANGED
@@ -11,6 +11,13 @@ email: postmodern.mod3@gmail.com
11
11
  homepage: https://github.com/postmodern/spidr#readme
12
12
  has_yard: true
13
13
 
14
+ metadata:
15
+ documentation_uri: https://rubydoc.info/gems/spidr
16
+ source_code_uri: https://github.com/postmodern/spidr.rb
17
+ bug_tracker_uri: https://github.com/postmodern/spidr.rb/issues
18
+ changelog_uri: https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md
19
+ rubygems_mfa_required: 'true'
20
+
14
21
  required_ruby_version: ">= 2.0.0"
15
22
 
16
23
  dependencies:
data/lib/spidr/agent/actions.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Spidr
2
4
  class Agent
3
5
  module Actions
@@ -96,7 +98,7 @@ module Spidr
96
98
 
97
99
  protected
98
100
 
99
- def initialize_actions(options={})
101
+ def initialize_actions
100
102
  @paused = false
101
103
  end
102
104
  end
data/lib/spidr/agent/events.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Spidr
2
4
  class Agent
3
5
  #
@@ -520,7 +522,7 @@ module Spidr
520
522
 
521
523
  protected
522
524
 
523
- def initialize_events(options={})
525
+ def initialize_events
524
526
  @every_url_blocks = []
525
527
  @every_failed_url_blocks = []
526
528
  @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
data/lib/spidr/agent/filters.rb CHANGED
@@ -1,4 +1,6 @@
1
- require 'spidr/rules'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../rules'
2
4
 
3
5
  module Spidr
4
6
  class Agent
@@ -170,7 +172,7 @@ module Spidr
170
172
  #
171
173
  # @yieldparam [String] link
172
174
  # A link to accept or reject.
173
- #
175
+ #
174
176
  # @since 0.2.4
175
177
  #
176
178
  def visit_links_like(pattern=nil,&block)
@@ -238,7 +240,7 @@ module Spidr
238
240
  #
239
241
  # @yieldparam [URI::HTTP, URI::HTTPS] url
240
242
  # A URL to accept or reject.
241
- #
243
+ #
242
244
  # @since 0.2.4
243
245
  #
244
246
  def visit_urls_like(pattern=nil,&block)
@@ -356,89 +358,88 @@ module Spidr
356
358
  #
357
359
  # Initializes filtering rules.
358
360
  #
359
- # @param [Hash] options
360
- # Additional options.
361
- #
362
- # @option options [Array] :schemes (['http', 'https'])
361
+ # @param [Array<String>] schemes
363
362
  # The list of acceptable URI schemes to visit.
364
363
  # The `https` scheme will be ignored if `net/https` cannot be loaded.
365
364
  #
366
- # @option options [String] :host
365
+ # @param [String] host
367
366
  # The host-name to visit.
368
367
  #
369
- # @option options [Array<String, Regexp, Proc>] :hosts
368
+ # @param [Array<String, Regexp, Proc>] hosts
370
369
  # The patterns which match the host-names to visit.
371
370
  #
372
- # @option options [Array<String, Regexp, Proc>] :ignore_hosts
371
+ # @param [Array<String, Regexp, Proc>] ignore_hosts
373
372
  # The patterns which match the host-names to not visit.
374
373
  #
375
- # @option options [Array<Integer, Regexp, Proc>] :ports
374
+ # @param [Array<Integer, Regexp, Proc>] ports
376
375
  # The patterns which match the ports to visit.
377
376
  #
378
- # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
377
+ # @param [Array<Integer, Regexp, Proc>] ignore_ports
379
378
  # The patterns which match the ports to not visit.
380
379
  #
381
- # @option options [Array<String, Regexp, Proc>] :links
380
+ # @param [Array<String, Regexp, Proc>] links
382
381
  # The patterns which match the links to visit.
383
382
  #
384
- # @option options [Array<String, Regexp, Proc>] :ignore_links
383
+ # @param [Array<String, Regexp, Proc>] ignore_links
385
384
  # The patterns which match the links to not visit.
386
385
  #
387
- # @option options [Array<String, Regexp, Proc>] :urls
386
+ # @param [Array<String, Regexp, Proc>] urls
388
387
  # The patterns which match the URLs to visit.
389
388
  #
390
- # @option options [Array<String, Regexp, Proc>] :ignore_urls
389
+ # @param [Array<String, Regexp, Proc>] ignore_urls
391
390
  # The patterns which match the URLs to not visit.
392
391
  #
393
- # @option options [Array<String, Regexp, Proc>] :exts
392
+ # @param [Array<String, Regexp, Proc>] exts
394
393
  # The patterns which match the URI path extensions to visit.
395
394
  #
396
- # @option options [Array<String, Regexp, Proc>] :ignore_exts
395
+ # @param [Array<String, Regexp, Proc>] ignore_exts
397
396
  # The patterns which match the URI path extensions to not visit.
398
397
  #
399
- def initialize_filters(options={})
400
- @schemes = []
398
+ def initialize_filters(schemes: self.class.default_schemes,
399
+ host: nil,
400
+ hosts: nil,
401
+ ignore_hosts: nil,
402
+ ports: nil,
403
+ ignore_ports: nil,
404
+ links: nil,
405
+ ignore_links: nil,
406
+ urls: nil,
407
+ ignore_urls: nil,
408
+ exts: nil,
409
+ ignore_exts: nil)
410
+ @schemes = schemes.map(&:to_s)
411
+
412
+ @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
413
+ @port_rules = Rules.new(accept: ports, reject: ignore_ports)
414
+ @link_rules = Rules.new(accept: links, reject: ignore_links)
415
+ @url_rules = Rules.new(accept: urls, reject: ignore_urls)
416
+ @ext_rules = Rules.new(accept: exts, reject: ignore_exts)
417
+
418
+ visit_hosts_like(host) if host
419
+ end
401
420
 
402
- if options[:schemes]
403
- self.schemes = options[:schemes]
404
- else
405
- @schemes << 'http'
421
+ #
422
+ # Determines the default URI schemes to follow.
423
+ #
424
+ # @return [Array<String>]
425
+ # The default URI schemes to follow.
426
+ #
427
+ # @since 0.6.2
428
+ #
429
+ def self.default_schemes
430
+ schemes = ['http']
406
431
 
407
- begin
408
- require 'net/https'
432
+ begin
433
+ require 'net/https'
409
434
 
410
- @schemes << 'https'
411
- rescue Gem::LoadError => e
412
- raise(e)
413
- rescue ::LoadError
414
- warn "Warning: cannot load 'net/https', https support disabled"
415
- end
435
+ schemes << 'https'
436
+ rescue Gem::LoadError => e
437
+ raise(e)
438
+ rescue ::LoadError
439
+ warn "Warning: cannot load 'net/https', https support disabled"
416
440
  end
417
441
 
418
- @host_rules = Rules.new(
419
- accept: options[:hosts],
420
- reject: options[:ignore_hosts]
421
- )
422
- @port_rules = Rules.new(
423
- accept: options[:ports],
424
- reject: options[:ignore_ports]
425
- )
426
- @link_rules = Rules.new(
427
- accept: options[:links],
428
- reject: options[:ignore_links]
429
- )
430
- @url_rules = Rules.new(
431
- accept: options[:urls],
432
- reject: options[:ignore_urls]
433
- )
434
- @ext_rules = Rules.new(
435
- accept: options[:exts],
436
- reject: options[:ignore_exts]
437
- )
438
-
439
- if options[:host]
440
- visit_hosts_like(options[:host])
441
- end
442
+ return schemes
442
443
  end
443
444
 
444
445
  #
data/lib/spidr/agent/robots.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  begin
2
4
  require 'robots'
3
5
  rescue LoadError
data/lib/spidr/agent/sanitizers.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'uri'
2
4
 
3
5
  module Spidr
@@ -34,20 +36,17 @@ module Spidr
34
36
  #
35
37
  # Initializes the Sanitizer rules.
36
38
  #
37
- # @param [Hash] options
38
- # Additional options.
39
- #
40
- # @option options [Boolean] :strip_fragments (true)
39
+ # @param [Boolean] strip_fragments
41
40
  # Specifies whether or not to strip the fragment component from URLs.
42
41
  #
43
- # @option options [Boolean] :strip_query (false)
42
+ # @param [Boolean] strip_query
44
43
  # Specifies whether or not to strip the query component from URLs.
45
44
  #
46
45
  # @since 0.2.2
47
46
  #
48
- def initialize_sanitizers(options={})
49
- @strip_fragments = options.fetch(:strip_fragments,true)
50
- @strip_query = options.fetch(:strip_query,false)
47
+ def initialize_sanitizers(strip_fragments: true, strip_query: false)
48
+ @strip_fragments = strip_fragments
49
+ @strip_query = strip_query
51
50
  end
52
51
 
53
52
  end