spidr 0.6.1 → 0.7.1

Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.editorconfig +11 -0
  3. data/.github/workflows/ruby.yml +26 -0
  4. data/.gitignore +4 -5
  5. data/ChangeLog.md +19 -1
  6. data/Gemfile +7 -4
  7. data/LICENSE.txt +1 -1
  8. data/README.md +136 -79
  9. data/Rakefile +1 -0
  10. data/gemspec.yml +7 -0
  11. data/lib/spidr/agent/actions.rb +3 -1
  12. data/lib/spidr/agent/events.rb +3 -1
  13. data/lib/spidr/agent/filters.rb +57 -56
  14. data/lib/spidr/agent/robots.rb +2 -0
  15. data/lib/spidr/agent/sanitizers.rb +7 -8
  16. data/lib/spidr/agent.rb +232 -108
  17. data/lib/spidr/auth_credential.rb +2 -0
  18. data/lib/spidr/auth_store.rb +9 -7
  19. data/lib/spidr/cookie_jar.rb +7 -5
  20. data/lib/spidr/extensions/uri.rb +3 -1
  21. data/lib/spidr/extensions.rb +3 -1
  22. data/lib/spidr/page/content_types.rb +53 -0
  23. data/lib/spidr/page/cookies.rb +2 -0
  24. data/lib/spidr/page/html.rb +21 -20
  25. data/lib/spidr/page/status_codes.rb +15 -11
  26. data/lib/spidr/page.rb +3 -1
  27. data/lib/spidr/proxy.rb +8 -14
  28. data/lib/spidr/rules.rb +7 -8
  29. data/lib/spidr/session_cache.rb +26 -22
  30. data/lib/spidr/settings/proxy.rb +22 -6
  31. data/lib/spidr/settings/timeouts.rb +2 -0
  32. data/lib/spidr/settings/user_agent.rb +2 -0
  33. data/lib/spidr/settings.rb +5 -3
  34. data/lib/spidr/spidr.rb +22 -11
  35. data/lib/spidr/version.rb +3 -1
  36. data/lib/spidr.rb +5 -3
  37. data/spec/agent_spec.rb +356 -7
  38. data/spec/example_page.rb +2 -0
  39. data/spec/page/content_types_spec.rb +22 -0
  40. data/spec/page/html_spec.rb +255 -51
  41. data/spec/page/status_codes_spec.rb +4 -4
  42. data/spec/proxy_spec.rb +2 -2
  43. data/spec/settings/proxy_examples.rb +31 -11
  44. data/spec/spec_helper.rb +3 -0
  45. data/spidr.gemspec +1 -4
  46. metadata +8 -7
  47. data/.travis.yml +0 -16
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: e2202e0ce389cbbc6f88360d7e7b430328bc6da973b69929ebc54b1d92c104bb
-   data.tar.gz: 0777e972ef2cb1d540ee138a7703ee67f88b482260074c13e8c08a8da963aa77
+   metadata.gz: 471764341b98b0cfeb57db24ac34a849dcfdcf43a751b648451a20c29c1ec051
+   data.tar.gz: '009c903cf30a13e55bbb8029fe2fdbfa4f8a8af32126b74aeb558f1afd3d3d88'
  SHA512:
-   metadata.gz: 45e923ad3aa59812de4af67cc3a1739d7751749b3aa3205c5979e1f29c302abd43316cd74ad7c61befda2d31b0ff080872047e9713537d3bfe2509a8d555156a
-   data.tar.gz: deae3dbd7d9566723ca8760064c715de8795b1d4e4be1ae0866497d697a411af3e8bbe2a677028a5bf57bba8e681bd9d712cef3a2d0d123bc903af28f5e32a77
+   metadata.gz: bddb65750dce8f6193764ac9d372adfa1893dc8743c24c383c359069043b51cd94e09ecd8bffad16bb8b4d92f99324c98ca95f8f59a9c9655a3f2fb7c42b9f57
+   data.tar.gz: c02f98806d9297ee22c6552eaaf6bb82f619001af25b0d8eeaabf91d0e32ab7154b5436de71ed4773b15353ba5556b52ece92a6035a891eb001c27b90e5cdda5
data/.editorconfig ADDED
@@ -0,0 +1,11 @@
+ root = true
+
+ [*]
+ end_of_line = lf
+ insert_final_newline = true
+ tab_width = 8
+ trim_trailing_whitespace = true
+
+ [{Gemfile,Rakefile,*.rb,*.gemspec,*.yml}]
+ indent_style = space
+ indent_size = 2
data/.github/workflows/ruby.yml ADDED
@@ -0,0 +1,26 @@
+ name: CI
+
+ on: [ push, pull_request ]
+
+ jobs:
+   tests:
+     runs-on: ubuntu-latest
+     strategy:
+       fail-fast: false
+       matrix:
+         ruby:
+           - '3.0'
+           - '3.1'
+           - '3.2'
+           - '3.3'
+           - jruby
+     name: Ruby ${{ matrix.ruby }}
+     steps:
+       - uses: actions/checkout@v4
+       - name: Set up Ruby
+         uses: ruby/setup-ruby@v1
+         with:
+           ruby-version: ${{ matrix.ruby }}
+           bundler-cache: true
+       - name: Run tests
+         run: bundle exec rake test
data/.gitignore CHANGED
@@ -1,8 +1,7 @@
- pkg
- doc
- web
- tmp
- Gemfile.lock
+ /Gemfile.lock
+ /coverage
+ /doc
+ /pkg
  .DS_Store
  .bundle
  .yardoc
data/ChangeLog.md CHANGED
@@ -1,6 +1,24 @@
+ ### 0.7.1 / 2024-01-25
+
+ * Switched to using `require_relative` to improve load-times.
+ * Added `# frozen_string_literal: true` to all files.
+ * Use keyword arguments for {Spidr.domain}.
+ * Rescue `URI::Error` instead of `Exception` when calling `URI::HTTP#merge` in
+   {Spidr::Page#to_absolute}.
+
+ ### 0.7.0 / 2022-12-31
+
+ * Added {Spidr.domain} and {Spidr::Agent.domain}.
+ * Added {Spidr::Page#gif?}.
+ * Added {Spidr::Page#jpeg?}.
+ * Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
+ * Added {Spidr::Page#png?}.
+ * {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
+   `URI::HTTP` object.
+
  ### 0.6.1 / 2019-10-24
 
- * Check for opaque component of URIs before attempting to set the path
+ * Check for the opaque component of URIs before attempting to set the path
    component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
    opaque` exceptions.
  * Fix `@robots` instance variable warning (@spk).
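
A minimal usage sketch of the 0.7.x additions listed above (`Spidr.domain`, the `Spidr::Page` image predicates, and the more permissive `proxy=` setter); the domain and proxy address below are placeholders:

```ruby
require 'spidr'

# Spidr.proxy= accepts a String or URI::HTTP as of 0.7.0; the address is a placeholder.
Spidr.proxy = 'http://127.0.0.1:8080'

# Spidr.domain (added in 0.7.0) spiders a domain and any of its sub-domains.
Spidr.domain('example.com') do |agent|
  agent.every_page do |page|
    # Content-type predicates added in 0.7.0.
    next if page.gif? || page.jpeg? || page.png? || page.icon?

    puts page.url
  end
end
```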
data/Gemfile CHANGED
@@ -12,10 +12,13 @@ group :development do
   gem 'rake'
   gem 'rubygems-tasks', '~> 0.2'
 
-  gem 'rspec', '~> 3.0'
-  gem 'webmock', '~> 3.0'
-  gem 'sinatra', '~> 1.0'
+  gem 'rspec',     '~> 3.0'
+  gem 'webmock',   '~> 3.0'
+  gem 'sinatra',   '~> 2.0'
+  gem 'simplecov', '~> 0.20'
 
   gem 'kramdown'
-  gem 'yard', '~> 0.9'
+  gem 'redcarpet',       platform: :mri
+  gem 'yard',            '~> 0.9'
+  gem 'yard-spellcheck', require: false
 end
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
- Copyright (c) 2008-2016 Hal Brodigan
+ Copyright (c) 2008-2024 Hal Brodigan
 
  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,11 +1,11 @@
  # Spidr
 
+ [![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
+
  * [Homepage](https://github.com/postmodern/spidr#readme)
  * [Source](https://github.com/postmodern/spidr)
  * [Issues](https://github.com/postmodern/spidr/issues)
  * [Mailing List](http://groups.google.com/group/spidr)
- * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
- * [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
 
  ## Description
 
@@ -49,137 +49,194 @@ and easy to use.
 
  Start spidering from a URL:
 
- Spidr.start_at('http://tenderlovemaking.com/')
+ ```ruby
+ Spidr.start_at('http://tenderlovemaking.com/') do |agent|
+   # ...
+ end
+ ```
 
  Spider a host:
 
- Spidr.host('solnic.eu')
+ ```ruby
+ Spidr.host('solnic.eu') do |agent|
+   # ...
+ end
+ ```
+
+ Spider a domain (and any sub-domains):
+
+ ```ruby
+ Spidr.domain('ruby-lang.org') do |agent|
+   # ...
+ end
+ ```
 
  Spider a site:
 
- Spidr.site('http://www.rubyflow.com/')
+ ```ruby
+ Spidr.site('http://www.rubyflow.com/') do |agent|
+   # ...
+ end
+ ```
 
  Spider multiple hosts:
 
- Spidr.start_at(
-   'http://company.com/',
-   hosts: [
-     'company.com',
-     /host[\d]+\.company\.com/
-   ]
- )
+ ```ruby
+ Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
+   # ...
+ end
+ ```
 
  Do not spider certain links:
 
- Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])
+ ```ruby
+ Spidr.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
+   # ...
+ end
+ ```
 
  Do not spider links on certain ports:
 
- Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
+ ```ruby
+ Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
+   # ...
+ end
+ ```
 
  Do not spider links blacklisted in robots.txt:
 
- Spidr.site(
-   'http://company.com/',
-   robots: true
- )
+ ```ruby
+ Spidr.site('http://company.com/', robots: true) do |agent|
+   # ...
+ end
+ ```
 
  Print out visited URLs:
 
- Spidr.site('http://www.rubyinside.com/') do |spider|
-   spider.every_url { |url| puts url }
- end
+ ```ruby
+ Spidr.site('http://www.rubyinside.com/') do |spider|
+   spider.every_url { |url| puts url }
+ end
+ ```
 
  Build a URL map of a site:
 
- url_map = Hash.new { |hash,key| hash[key] = [] }
+ ```ruby
+ url_map = Hash.new { |hash,key| hash[key] = [] }
 
- Spidr.site('http://intranet.com/') do |spider|
-   spider.every_link do |origin,dest|
-     url_map[dest] << origin
-   end
- end
+ Spidr.site('http://intranet.com/') do |spider|
+   spider.every_link do |origin,dest|
+     url_map[dest] << origin
+   end
+ end
+ ```
 
  Print out the URLs that could not be requested:
 
- Spidr.site('http://company.com/') do |spider|
-   spider.every_failed_url { |url| puts url }
- end
+ ```ruby
+ Spidr.site('http://company.com/') do |spider|
+   spider.every_failed_url { |url| puts url }
+ end
+ ```
 
  Finds all pages which have broken links:
 
- url_map = Hash.new { |hash,key| hash[key] = [] }
+ ```ruby
+ url_map = Hash.new { |hash,key| hash[key] = [] }
 
- spider = Spidr.site('http://intranet.com/') do |spider|
-   spider.every_link do |origin,dest|
-     url_map[dest] << origin
-   end
- end
+ spider = Spidr.site('http://intranet.com/') do |spider|
+   spider.every_link do |origin,dest|
+     url_map[dest] << origin
+   end
+ end
 
- spider.failures.each do |url|
-   puts "Broken link #{url} found in:"
+ spider.failures.each do |url|
+   puts "Broken link #{url} found in:"
 
-   url_map[url].each { |page| puts "  #{page}" }
- end
+   url_map[url].each { |page| puts "  #{page}" }
+ end
+ ```
 
  Search HTML and XML pages:
 
- Spidr.site('http://company.com/') do |spider|
-   spider.every_page do |page|
-     puts ">>> #{page.url}"
+ ```ruby
+ Spidr.site('http://company.com/') do |spider|
+   spider.every_page do |page|
+     puts ">>> #{page.url}"
 
-     page.search('//meta').each do |meta|
-       name = (meta.attributes['name'] || meta.attributes['http-equiv'])
-       value = meta.attributes['content']
+     page.search('//meta').each do |meta|
+       name  = (meta.attributes['name'] || meta.attributes['http-equiv'])
+       value = meta.attributes['content']
 
-       puts "  #{name} = #{value}"
-     end
-   end
+       puts "  #{name} = #{value}"
      end
+   end
+ end
+ ```
 
  Print out the titles from every page:
 
- Spidr.site('https://www.ruby-lang.org/') do |spider|
-   spider.every_html_page do |page|
-     puts page.title
-   end
- end
+ ```ruby
+ Spidr.site('https://www.ruby-lang.org/') do |spider|
+   spider.every_html_page do |page|
+     puts page.title
+   end
+ end
+ ```
+
+ Print out every HTTP redirect:
+
+ ```ruby
+ Spidr.host('company.com') do |spider|
+   spider.every_redirect_page do |page|
+     puts "#{page.url} -> #{page.headers['Location']}"
+   end
+ end
+ ```
 
  Find what kinds of web servers a host is using, by accessing the headers:
 
- servers = Set[]
+ ```ruby
+ servers = Set[]
 
- Spidr.host('company.com') do |spider|
-   spider.all_headers do |headers|
-     servers << headers['server']
-   end
- end
+ Spidr.host('company.com') do |spider|
+   spider.all_headers do |headers|
+     servers << headers['server']
+   end
+ end
+ ```
 
  Pause the spider on a forbidden page:
 
- Spidr.host('company.com') do |spider|
-   spider.every_forbidden_page do |page|
-     spider.pause!
-   end
- end
+ ```ruby
+ Spidr.host('company.com') do |spider|
+   spider.every_forbidden_page do |page|
+     spider.pause!
+   end
+ end
+ ```
 
  Skip the processing of a page:
 
- Spidr.host('company.com') do |spider|
-   spider.every_missing_page do |page|
-     spider.skip_page!
-   end
- end
+ ```ruby
+ Spidr.host('company.com') do |spider|
+   spider.every_missing_page do |page|
+     spider.skip_page!
+   end
+ end
+ ```
 
  Skip the processing of links:
 
- Spidr.host('company.com') do |spider|
-   spider.every_url do |url|
-     if url.path.split('/').find { |dir| dir.to_i > 1000 }
-       spider.skip_link!
-     end
-   end
+ ```ruby
+ Spidr.host('company.com') do |spider|
+   spider.every_url do |url|
+     if url.path.split('/').find { |dir| dir.to_i > 1000 }
+       spider.skip_link!
      end
+   end
+ end
+ ```
 
  ## Requirements
 
@@ -188,12 +245,12 @@ Skip the processing of links:
 
  ## Install
 
- $ gem install spidr
+ ```shell
+ $ gem install spidr
+ ```
 
  ## License
 
- Copyright (c) 2008-2016 Hal Brodigan
-
  See {file:LICENSE.txt} for license information.
 
  [ruby]: https://www.ruby-lang.org/
data/Rakefile CHANGED
@@ -12,6 +12,7 @@ Gem::Tasks.new
 
  require 'rspec/core/rake_task'
  RSpec::Core::RakeTask.new
+ task :test => :spec
  task :default => :spec
 
  require 'yard'
data/gemspec.yml CHANGED
@@ -11,6 +11,13 @@ email: postmodern.mod3@gmail.com
  homepage: https://github.com/postmodern/spidr#readme
  has_yard: true
 
+ metadata:
+   documentation_uri: https://rubydoc.info/gems/spidr
+   source_code_uri: https://github.com/postmodern/spidr.rb
+   bug_tracker_uri: https://github.com/postmodern/spidr.rb/issues
+   changelog_uri: https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md
+   rubygems_mfa_required: 'true'
+
  required_ruby_version: ">= 2.0.0"
 
  dependencies:
data/lib/spidr/agent/actions.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module Spidr
    class Agent
      module Actions
@@ -96,7 +98,7 @@ module Spidr
 
      protected
 
-     def initialize_actions(options={})
+     def initialize_actions
        @paused = false
      end
    end
data/lib/spidr/agent/events.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module Spidr
    class Agent
      #
@@ -520,7 +522,7 @@ module Spidr
 
      protected
 
-     def initialize_events(options={})
+     def initialize_events
        @every_url_blocks = []
        @every_failed_url_blocks = []
        @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
data/lib/spidr/agent/filters.rb CHANGED
@@ -1,4 +1,6 @@
- require 'spidr/rules'
+ # frozen_string_literal: true
+
+ require_relative '../rules'
 
  module Spidr
    class Agent
@@ -170,7 +172,7 @@ module Spidr
    #
    # @yieldparam [String] link
    #   A link to accept or reject.
-   #
+   #
    # @since 0.2.4
    #
    def visit_links_like(pattern=nil,&block)
@@ -238,7 +240,7 @@ module Spidr
    #
    # @yieldparam [URI::HTTP, URI::HTTPS] url
    #   A URL to accept or reject.
-   #
+   #
    # @since 0.2.4
    #
    def visit_urls_like(pattern=nil,&block)
@@ -356,89 +358,88 @@ module Spidr
    #
    # Initializes filtering rules.
    #
-   # @param [Hash] options
-   #   Additional options.
-   #
-   # @option options [Array] :schemes (['http', 'https'])
+   # @param [Array<String>] schemes
    #   The list of acceptable URI schemes to visit.
    #   The `https` scheme will be ignored if `net/https` cannot be loaded.
    #
-   # @option options [String] :host
+   # @param [String] host
    #   The host-name to visit.
    #
-   # @option options [Array<String, Regexp, Proc>] :hosts
+   # @param [Array<String, Regexp, Proc>] hosts
    #   The patterns which match the host-names to visit.
    #
-   # @option options [Array<String, Regexp, Proc>] :ignore_hosts
+   # @param [Array<String, Regexp, Proc>] ignore_hosts
    #   The patterns which match the host-names to not visit.
    #
-   # @option options [Array<Integer, Regexp, Proc>] :ports
+   # @param [Array<Integer, Regexp, Proc>] ports
    #   The patterns which match the ports to visit.
    #
-   # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
+   # @param [Array<Integer, Regexp, Proc>] ignore_ports
    #   The patterns which match the ports to not visit.
    #
-   # @option options [Array<String, Regexp, Proc>] :links
+   # @param [Array<String, Regexp, Proc>] links
    #   The patterns which match the links to visit.
    #
-   # @option options [Array<String, Regexp, Proc>] :ignore_links
+   # @param [Array<String, Regexp, Proc>] ignore_links
    #   The patterns which match the links to not visit.
    #
-   # @option options [Array<String, Regexp, Proc>] :urls
+   # @param [Array<String, Regexp, Proc>] urls
    #   The patterns which match the URLs to visit.
    #
-   # @option options [Array<String, Regexp, Proc>] :ignore_urls
+   # @param [Array<String, Regexp, Proc>] ignore_urls
    #   The patterns which match the URLs to not visit.
    #
-   # @option options [Array<String, Regexp, Proc>] :exts
+   # @param [Array<String, Regexp, Proc>] exts
    #   The patterns which match the URI path extensions to visit.
    #
-   # @option options [Array<String, Regexp, Proc>] :ignore_exts
+   # @param [Array<String, Regexp, Proc>] ignore_exts
    #   The patterns which match the URI path extensions to not visit.
    #
-   def initialize_filters(options={})
-     @schemes = []
+   def initialize_filters(schemes: self.class.default_schemes,
+                          host: nil,
+                          hosts: nil,
+                          ignore_hosts: nil,
+                          ports: nil,
+                          ignore_ports: nil,
+                          links: nil,
+                          ignore_links: nil,
+                          urls: nil,
+                          ignore_urls: nil,
+                          exts: nil,
+                          ignore_exts: nil)
+     @schemes = schemes.map(&:to_s)
+
+     @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
+     @port_rules = Rules.new(accept: ports, reject: ignore_ports)
+     @link_rules = Rules.new(accept: links, reject: ignore_links)
+     @url_rules = Rules.new(accept: urls, reject: ignore_urls)
+     @ext_rules = Rules.new(accept: exts, reject: ignore_exts)
+
+     visit_hosts_like(host) if host
+   end
 
-     if options[:schemes]
-       self.schemes = options[:schemes]
-     else
-       @schemes << 'http'
+   #
+   # Determines the default URI schemes to follow.
+   #
+   # @return [Array<String>]
+   #   The default URI schemes to follow.
+   #
+   # @since 0.6.2
+   #
+   def self.default_schemes
+     schemes = ['http']
 
-       begin
-         require 'net/https'
+     begin
+       require 'net/https'
 
-         @schemes << 'https'
-       rescue Gem::LoadError => e
-         raise(e)
-       rescue ::LoadError
-         warn "Warning: cannot load 'net/https', https support disabled"
-       end
+       schemes << 'https'
+     rescue Gem::LoadError => e
+       raise(e)
+     rescue ::LoadError
+       warn "Warning: cannot load 'net/https', https support disabled"
      end
 
-     @host_rules = Rules.new(
-       accept: options[:hosts],
-       reject: options[:ignore_hosts]
-     )
-     @port_rules = Rules.new(
-       accept: options[:ports],
-       reject: options[:ignore_ports]
-     )
-     @link_rules = Rules.new(
-       accept: options[:links],
-       reject: options[:ignore_links]
-     )
-     @url_rules = Rules.new(
-       accept: options[:urls],
-       reject: options[:ignore_urls]
-     )
-     @ext_rules = Rules.new(
-       accept: options[:exts],
-       reject: options[:ignore_exts]
-     )
-
-     if options[:host]
-       visit_hosts_like(options[:host])
-     end
+     return schemes
    end
 
    #
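
The keyword arguments introduced above are the same filter options callers pass through `Spidr::Agent.new` and the `Spidr.site`/`Spidr.host` helpers. A minimal sketch under that assumption; the host name and patterns below are placeholders:

```ruby
require 'spidr'

# Filter keywords mirror initialize_filters above; host and patterns are placeholders.
agent = Spidr::Agent.new(
  host:         'example.com',
  schemes:      %w[https],
  ignore_links: [%r{^/(login|logout)}],
  ignore_exts:  %w[pdf zip]
)

agent.start_at('https://example.com/')
```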
data/lib/spidr/agent/robots.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  begin
    require 'robots'
  rescue LoadError
data/lib/spidr/agent/sanitizers.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  require 'uri'
 
  module Spidr
@@ -34,20 +36,17 @@ module Spidr
    #
    # Initializes the Sanitizer rules.
    #
-   # @param [Hash] options
-   #   Additional options.
-   #
-   # @option options [Boolean] :strip_fragments (true)
+   # @param [Boolean] strip_fragments
    #   Specifies whether or not to strip the fragment component from URLs.
    #
-   # @option options [Boolean] :strip_query (false)
+   # @param [Boolean] strip_query
    #   Specifies whether or not to strip the query component from URLs.
    #
    # @since 0.2.2
    #
-   def initialize_sanitizers(options={})
-     @strip_fragments = options.fetch(:strip_fragments,true)
-     @strip_query = options.fetch(:strip_query,false)
+   def initialize_sanitizers(strip_fragments: true, strip_query: false)
+     @strip_fragments = strip_fragments
+     @strip_query = strip_query
    end
 
  end
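
The sanitizer keywords above (`strip_fragments`, defaulting to true, and `strip_query`, defaulting to false) are forwarded from the spidering helpers. A minimal sketch under that assumption, with a placeholder URL:

```ruby
require 'spidr'

# strip_query maps onto the keyword argument of initialize_sanitizers above;
# the site URL is a placeholder.
Spidr.site('https://example.com/', strip_query: true) do |spider|
  spider.every_url { |url| puts url } # yielded URLs have no ?query or #fragment
end
```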