spidr 0.6.0 → 0.7.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
4
- data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
2
+ SHA256:
3
+ metadata.gz: 46a2f2ad2ca789b83fac0e2519294403734e2ad6d647fbc3a612d429e57c1b43
4
+ data.tar.gz: b72f561e337c6a0fcdbca9f59562e06f0b5854b15d321f90be1a4168b352faca
5
5
  SHA512:
6
- metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
7
- data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
6
+ metadata.gz: ced221d8cdbeaf95df12d6c038de6539a5148657209137433cc82c5abc69779a13376a7e6becdf423d2f2bdd9ebfaf8c7b94a51dda70ffcbab932da4fc5260b3
7
+ data.tar.gz: f54bedf3648dd033b8a37388413ae4ab71b4b09f16cc508b8e43e72f2ef870c59fe325e3f36a841791d9d843acb08bb02009469168e9b231a9835a0249b55b6c
data/.editorconfig ADDED
@@ -0,0 +1,11 @@
1
+ root = true
2
+
3
+ [*]
4
+ end_of_line = lf
5
+ insert_final_newline = true
6
+ tab_width = 8
7
+ trim_trailing_whitespace = true
8
+
9
+ [{Gemfile,Rakefile,*.rb,*.gemspec,*.yml}]
10
+ indent_style = space
11
+ indent_size = 2
data/.github/workflows/ruby.yml ADDED
@@ -0,0 +1,26 @@
1
+ name: CI
2
+
3
+ on: [ push, pull_request ]
4
+
5
+ jobs:
6
+ tests:
7
+ runs-on: ubuntu-latest
8
+ strategy:
9
+ fail-fast: false
10
+ matrix:
11
+ ruby:
12
+ - 2.7
13
+ - '3.0'
14
+ - '3.1'
15
+ - jruby
16
+ name: Ruby ${{ matrix.ruby }}
17
+ steps:
18
+ - uses: actions/checkout@v2
19
+ - name: Set up Ruby
20
+ uses: ruby/setup-ruby@v1
21
+ with:
22
+ ruby-version: ${{ matrix.ruby }}
23
+ - name: Install dependencies
24
+ run: bundle install --jobs 4 --retry 3
25
+ - name: Run tests
26
+ run: bundle exec rake test
data/.gitignore CHANGED
@@ -1,8 +1,7 @@
1
- pkg
2
- doc
3
- web
4
- tmp
5
- Gemfile.lock
1
+ /Gemfile.lock
2
+ /coverage
3
+ /doc
4
+ /pkg
6
5
  .DS_Store
7
6
  .bundle
8
7
  .yardoc
data/ChangeLog.md CHANGED
@@ -1,3 +1,20 @@
1
+ ### 0.7.0 / 2022-12-31
2
+
3
+ * Added {Spidr.domain} and {Spidr::Agent.domain}.
4
+ * Added {Spidr::Page#gif?}.
5
+ * Added {Spidr::Page#jpeg?}.
6
+ * Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
7
+ * Added {Spidr::Page#png?}.
8
+ * {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
9
+ `URI::HTTP` object.
10
+
11
+ ### 0.6.1 / 2019-10-24
12
+
13
+ * Check for the opaque component of URIs before attempting to set the path
14
+ component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
15
+ opaque` exceptions.
16
+ * Fix `@robots` instance variable warning (@spk).
17
+
1
18
  ### 0.6.0 / 2016-08-04
2
19
 
3
20
  * Added {Spidr::Proxy}.
data/Gemfile CHANGED
@@ -12,10 +12,13 @@ group :development do
12
12
  gem 'rake'
13
13
  gem 'rubygems-tasks', '~> 0.2'
14
14
 
15
- gem 'rspec', '~> 3.0'
16
- gem 'webmock', '~> 2.0'
17
- gem 'sinatra', '~> 1.0'
15
+ gem 'rspec', '~> 3.0'
16
+ gem 'webmock', '~> 3.0'
17
+ gem 'sinatra', '~> 2.0'
18
+ gem 'simplecov', '~> 0.20'
18
19
 
19
- gem 'kramdown', '~> 0.12'
20
- gem 'yard', '~> 0.8'
20
+ gem 'kramdown'
21
+ gem 'redcarpet', platform: :mri
22
+ gem 'yard', '~> 0.9'
23
+ gem 'yard-spellcheck', require: false
21
24
  end
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2008-2016 Hal Brodigan
1
+ Copyright (c) 2008-2022 Hal Brodigan
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,11 +1,11 @@
1
1
  # Spidr
2
2
 
3
+ [![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
4
+
3
5
  * [Homepage](https://github.com/postmodern/spidr#readme)
4
6
  * [Source](https://github.com/postmodern/spidr)
5
7
  * [Issues](https://github.com/postmodern/spidr/issues)
6
8
  * [Mailing List](http://groups.google.com/group/spidr)
7
- * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
8
- * [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
9
9
 
10
10
  ## Description
11
11
 
@@ -49,137 +49,194 @@ and easy to use.
49
49
 
50
50
  Start spidering from a URL:
51
51
 
52
- Spidr.start_at('http://tenderlovemaking.com/')
52
+ ```ruby
53
+ Spidr.start_at('http://tenderlovemaking.com/') do |agent|
54
+ # ...
55
+ end
56
+ ```
53
57
 
54
58
  Spider a host:
55
59
 
56
- Spidr.host('solnic.eu')
60
+ ```ruby
61
+ Spidr.host('solnic.eu') do |agent|
62
+ # ...
63
+ end
64
+ ```
65
+
66
+ Spider a domain (and any sub-domains):
67
+
68
+ ```ruby
69
+ Spidr.domain('ruby-lang.org') do |agent|
70
+ # ...
71
+ end
72
+ ```
57
73
 
58
74
  Spider a site:
59
75
 
60
- Spidr.site('http://www.rubyflow.com/')
76
+ ```ruby
77
+ Spidr.site('http://www.rubyflow.com/') do |agent|
78
+ # ...
79
+ end
80
+ ```
61
81
 
62
82
  Spider multiple hosts:
63
83
 
64
- Spidr.start_at(
65
- 'http://company.com/',
66
- hosts: [
67
- 'company.com',
68
- /host[\d]+\.company\.com/
69
- ]
70
- )
84
+ ```ruby
85
+ Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
86
+ # ...
87
+ end
88
+ ```
71
89
 
72
90
  Do not spider certain links:
73
91
 
74
- Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])
92
+ ```ruby
93
+ Spidr.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
94
+ # ...
95
+ end
96
+ ```
75
97
 
76
98
  Do not spider links on certain ports:
77
99
 
78
- Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
100
+ ```ruby
101
+ Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
102
+ # ...
103
+ end
104
+ ```
79
105
 
80
106
  Do not spider links blacklisted in robots.txt:
81
107
 
82
- Spidr.site(
83
- 'http://company.com/',
84
- robots: true
85
- )
108
+ ```ruby
109
+ Spidr.site('http://company.com/', robots: true) do |agent|
110
+ # ...
111
+ end
112
+ ```
86
113
 
87
114
  Print out visited URLs:
88
115
 
89
- Spidr.site('http://www.rubyinside.com/') do |spider|
90
- spider.every_url { |url| puts url }
91
- end
116
+ ```ruby
117
+ Spidr.site('http://www.rubyinside.com/') do |spider|
118
+ spider.every_url { |url| puts url }
119
+ end
120
+ ```
92
121
 
93
122
  Build a URL map of a site:
94
123
 
95
- url_map = Hash.new { |hash,key| hash[key] = [] }
124
+ ```ruby
125
+ url_map = Hash.new { |hash,key| hash[key] = [] }
96
126
 
97
- Spidr.site('http://intranet.com/') do |spider|
98
- spider.every_link do |origin,dest|
99
- url_map[dest] << origin
100
- end
101
- end
127
+ Spidr.site('http://intranet.com/') do |spider|
128
+ spider.every_link do |origin,dest|
129
+ url_map[dest] << origin
130
+ end
131
+ end
132
+ ```
102
133
 
103
134
  Print out the URLs that could not be requested:
104
135
 
105
- Spidr.site('http://company.com/') do |spider|
106
- spider.every_failed_url { |url| puts url }
107
- end
136
+ ```ruby
137
+ Spidr.site('http://company.com/') do |spider|
138
+ spider.every_failed_url { |url| puts url }
139
+ end
140
+ ```
108
141
 
109
142
  Finds all pages which have broken links:
110
143
 
111
- url_map = Hash.new { |hash,key| hash[key] = [] }
144
+ ```ruby
145
+ url_map = Hash.new { |hash,key| hash[key] = [] }
112
146
 
113
- spider = Spidr.site('http://intranet.com/') do |spider|
114
- spider.every_link do |origin,dest|
115
- url_map[dest] << origin
116
- end
117
- end
147
+ spider = Spidr.site('http://intranet.com/') do |spider|
148
+ spider.every_link do |origin,dest|
149
+ url_map[dest] << origin
150
+ end
151
+ end
118
152
 
119
- spider.failures.each do |url|
120
- puts "Broken link #{url} found in:"
153
+ spider.failures.each do |url|
154
+ puts "Broken link #{url} found in:"
121
155
 
122
- url_map[url].each { |page| puts " #{page}" }
123
- end
156
+ url_map[url].each { |page| puts " #{page}" }
157
+ end
158
+ ```
124
159
 
125
160
  Search HTML and XML pages:
126
161
 
127
- Spidr.site('http://company.com/') do |spider|
128
- spider.every_page do |page|
129
- puts ">>> #{page.url}"
162
+ ```ruby
163
+ Spidr.site('http://company.com/') do |spider|
164
+ spider.every_page do |page|
165
+ puts ">>> #{page.url}"
130
166
 
131
- page.search('//meta').each do |meta|
132
- name = (meta.attributes['name'] || meta.attributes['http-equiv'])
133
- value = meta.attributes['content']
167
+ page.search('//meta').each do |meta|
168
+ name = (meta.attributes['name'] || meta.attributes['http-equiv'])
169
+ value = meta.attributes['content']
134
170
 
135
- puts " #{name} = #{value}"
136
- end
137
- end
171
+ puts " #{name} = #{value}"
138
172
  end
173
+ end
174
+ end
175
+ ```
139
176
 
140
177
  Print out the titles from every page:
141
178
 
142
- Spidr.site('https://www.ruby-lang.org/') do |spider|
143
- spider.every_html_page do |page|
144
- puts page.title
145
- end
146
- end
179
+ ```ruby
180
+ Spidr.site('https://www.ruby-lang.org/') do |spider|
181
+ spider.every_html_page do |page|
182
+ puts page.title
183
+ end
184
+ end
185
+ ```
186
+
187
+ Print out every HTTP redirect:
188
+
189
+ ```ruby
190
+ Spidr.host('company.com') do |spider|
191
+ spider.every_redirect_page do |page|
192
+ puts "#{page.url} -> #{page.headers['Location']}"
193
+ end
194
+ end
195
+ ```
147
196
 
148
197
  Find what kinds of web servers a host is using, by accessing the headers:
149
198
 
150
- servers = Set[]
199
+ ```ruby
200
+ servers = Set[]
151
201
 
152
- Spidr.host('company.com') do |spider|
153
- spider.all_headers do |headers|
154
- servers << headers['server']
155
- end
156
- end
202
+ Spidr.host('company.com') do |spider|
203
+ spider.all_headers do |headers|
204
+ servers << headers['server']
205
+ end
206
+ end
207
+ ```
157
208
 
158
209
  Pause the spider on a forbidden page:
159
210
 
160
- spider = Spidr.host('company.com') do |spider|
161
- spider.every_forbidden_page do |page|
162
- spider.pause!
163
- end
164
- end
211
+ ```ruby
212
+ Spidr.host('company.com') do |spider|
213
+ spider.every_forbidden_page do |page|
214
+ spider.pause!
215
+ end
216
+ end
217
+ ```
165
218
 
166
219
  Skip the processing of a page:
167
220
 
168
- Spidr.host('company.com') do |spider|
169
- spider.every_missing_page do |page|
170
- spider.skip_page!
171
- end
172
- end
221
+ ```ruby
222
+ Spidr.host('company.com') do |spider|
223
+ spider.every_missing_page do |page|
224
+ spider.skip_page!
225
+ end
226
+ end
227
+ ```
173
228
 
174
229
  Skip the processing of links:
175
230
 
176
- Spidr.host('company.com') do |spider|
177
- spider.every_url do |url|
178
- if url.path.split('/').find { |dir| dir.to_i > 1000 }
179
- spider.skip_link!
180
- end
181
- end
231
+ ```ruby
232
+ Spidr.host('company.com') do |spider|
233
+ spider.every_url do |url|
234
+ if url.path.split('/').find { |dir| dir.to_i > 1000 }
235
+ spider.skip_link!
182
236
  end
237
+ end
238
+ end
239
+ ```
183
240
 
184
241
  ## Requirements
185
242
 
@@ -188,11 +245,13 @@ Skip the processing of links:
188
245
 
189
246
  ## Install
190
247
 
191
- $ gem install spidr
248
+ ```shell
249
+ $ gem install spidr
250
+ ```
192
251
 
193
252
  ## License
194
253
 
195
- Copyright (c) 2008-2016 Hal Brodigan
254
+ Copyright (c) 2008-2022 Hal Brodigan
196
255
 
197
256
  See {file:LICENSE.txt} for license information.
198
257
 
data/Rakefile CHANGED
@@ -12,6 +12,7 @@ Gem::Tasks.new
12
12
 
13
13
  require 'rspec/core/rake_task'
14
14
  RSpec::Core::RakeTask.new
15
+ task :test => :spec
15
16
  task :default => :spec
16
17
 
17
18
  require 'yard'
data/gemspec.yml CHANGED
@@ -11,10 +11,17 @@ email: postmodern.mod3@gmail.com
11
11
  homepage: https://github.com/postmodern/spidr#readme
12
12
  has_yard: true
13
13
 
14
+ metadata:
15
+ documentation_uri: https://rubydoc.info/gems/spidr
16
+ source_code_uri: https://github.com/postmodern/spidr.rb
17
+ bug_tracker_uri: https://github.com/postmodern/spidr.rb/issues
18
+ changelog_uri: https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md
19
+ rubygems_mfa_required: 'true'
20
+
14
21
  required_ruby_version: ">= 2.0.0"
15
22
 
16
23
  dependencies:
17
24
  nokogiri: ~> 1.3
18
25
 
19
26
  development_dependencies:
20
- bundler: ~> 1.0
27
+ bundler: ~> 2.0
@@ -96,7 +96,7 @@ module Spidr
96
96
 
97
97
  protected
98
98
 
99
- def initialize_actions(options={})
99
+ def initialize_actions
100
100
  @paused = false
101
101
  end
102
102
  end
@@ -520,7 +520,7 @@ module Spidr
520
520
 
521
521
  protected
522
522
 
523
- def initialize_events(options={})
523
+ def initialize_events
524
524
  @every_url_blocks = []
525
525
  @every_failed_url_blocks = []
526
526
  @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@@ -16,7 +16,7 @@ module Spidr
16
16
  # agent.schemes = ['http']
17
17
  #
18
18
  def schemes=(new_schemes)
19
- @schemes = new_schemes.map { |scheme| scheme.to_s }
19
+ @schemes = new_schemes.map(&:to_s)
20
20
  end
21
21
 
22
22
  #
@@ -356,89 +356,88 @@ module Spidr
356
356
  #
357
357
  # Initializes filtering rules.
358
358
  #
359
- # @param [Hash] options
360
- # Additional options.
361
- #
362
- # @option options [Array] :schemes (['http', 'https'])
359
+ # @param [Array<String>] schemes
363
360
  # The list of acceptable URI schemes to visit.
364
361
  # The `https` scheme will be ignored if `net/https` cannot be loaded.
365
362
  #
366
- # @option options [String] :host
363
+ # @param [String] host
367
364
  # The host-name to visit.
368
365
  #
369
- # @option options [Array<String, Regexp, Proc>] :hosts
366
+ # @param [Array<String, Regexp, Proc>] hosts
370
367
  # The patterns which match the host-names to visit.
371
368
  #
372
- # @option options [Array<String, Regexp, Proc>] :ignore_hosts
369
+ # @param [Array<String, Regexp, Proc>] ignore_hosts
373
370
  # The patterns which match the host-names to not visit.
374
371
  #
375
- # @option options [Array<Integer, Regexp, Proc>] :ports
372
+ # @param [Array<Integer, Regexp, Proc>] ports
376
373
  # The patterns which match the ports to visit.
377
374
  #
378
- # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
375
+ # @param [Array<Integer, Regexp, Proc>] ignore_ports
379
376
  # The patterns which match the ports to not visit.
380
377
  #
381
- # @option options [Array<String, Regexp, Proc>] :links
378
+ # @param [Array<String, Regexp, Proc>] links
382
379
  # The patterns which match the links to visit.
383
380
  #
384
- # @option options [Array<String, Regexp, Proc>] :ignore_links
381
+ # @param [Array<String, Regexp, Proc>] ignore_links
385
382
  # The patterns which match the links to not visit.
386
383
  #
387
- # @option options [Array<String, Regexp, Proc>] :urls
384
+ # @param [Array<String, Regexp, Proc>] urls
388
385
  # The patterns which match the URLs to visit.
389
386
  #
390
- # @option options [Array<String, Regexp, Proc>] :ignore_urls
387
+ # @param [Array<String, Regexp, Proc>] ignore_urls
391
388
  # The patterns which match the URLs to not visit.
392
389
  #
393
- # @option options [Array<String, Regexp, Proc>] :exts
390
+ # @param [Array<String, Regexp, Proc>] exts
394
391
  # The patterns which match the URI path extensions to visit.
395
392
  #
396
- # @option options [Array<String, Regexp, Proc>] :ignore_exts
393
+ # @param [Array<String, Regexp, Proc>] ignore_exts
397
394
  # The patterns which match the URI path extensions to not visit.
398
395
  #
399
- def initialize_filters(options={})
400
- @schemes = []
396
+ def initialize_filters(schemes: self.class.default_schemes,
397
+ host: nil,
398
+ hosts: nil,
399
+ ignore_hosts: nil,
400
+ ports: nil,
401
+ ignore_ports: nil,
402
+ links: nil,
403
+ ignore_links: nil,
404
+ urls: nil,
405
+ ignore_urls: nil,
406
+ exts: nil,
407
+ ignore_exts: nil)
408
+ @schemes = schemes.map(&:to_s)
409
+
410
+ @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
411
+ @port_rules = Rules.new(accept: ports, reject: ignore_ports)
412
+ @link_rules = Rules.new(accept: links, reject: ignore_links)
413
+ @url_rules = Rules.new(accept: urls, reject: ignore_urls)
414
+ @ext_rules = Rules.new(accept: exts, reject: ignore_exts)
415
+
416
+ visit_hosts_like(host) if host
417
+ end
401
418
 
402
- if options[:schemes]
403
- self.schemes = options[:schemes]
404
- else
405
- @schemes << 'http'
419
+ #
420
+ # Determines the default URI schemes to follow.
421
+ #
422
+ # @return [Array<String>]
423
+ # The default URI schemes to follow.
424
+ #
425
+ # @since 0.6.2
426
+ #
427
+ def self.default_schemes
428
+ schemes = ['http']
406
429
 
407
- begin
408
- require 'net/https'
430
+ begin
431
+ require 'net/https'
409
432
 
410
- @schemes << 'https'
411
- rescue Gem::LoadError => e
412
- raise(e)
413
- rescue ::LoadError
414
- warn "Warning: cannot load 'net/https', https support disabled"
415
- end
433
+ schemes << 'https'
434
+ rescue Gem::LoadError => e
435
+ raise(e)
436
+ rescue ::LoadError
437
+ warn "Warning: cannot load 'net/https', https support disabled"
416
438
  end
417
439
 
418
- @host_rules = Rules.new(
419
- accept: options[:hosts],
420
- reject: options[:ignore_hosts]
421
- )
422
- @port_rules = Rules.new(
423
- accept: options[:ports],
424
- reject: options[:ignore_ports]
425
- )
426
- @link_rules = Rules.new(
427
- accept: options[:links],
428
- reject: options[:ignore_links]
429
- )
430
- @url_rules = Rules.new(
431
- accept: options[:urls],
432
- reject: options[:ignore_urls]
433
- )
434
- @ext_rules = Rules.new(
435
- accept: options[:exts],
436
- reject: options[:ignore_exts]
437
- )
438
-
439
- if options[:host]
440
- visit_hosts_like(options[:host])
441
- end
440
+ return schemes
442
441
  end
443
442
 
444
443
  #
@@ -452,9 +451,9 @@ module Spidr
452
451
  #
453
452
  def visit_scheme?(scheme)
454
453
  if scheme
455
- return @schemes.include?(scheme)
454
+ @schemes.include?(scheme)
456
455
  else
457
- return true
456
+ true
458
457
  end
459
458
  end
460
459
 
@@ -21,7 +21,7 @@ module Spidr
21
21
  # @since 0.2.2
22
22
  #
23
23
  def sanitize_url(url)
24
- url = URI(url.to_s) unless url.kind_of?(URI)
24
+ url = URI(url)
25
25
 
26
26
  url.fragment = nil if @strip_fragments
27
27
  url.query = nil if @strip_query
@@ -34,20 +34,17 @@ module Spidr
34
34
  #
35
35
  # Initializes the Sanitizer rules.
36
36
  #
37
- # @param [Hash] options
38
- # Additional options.
39
- #
40
- # @option options [Boolean] :strip_fragments (true)
37
+ # @param [Boolean] strip_fragments
41
38
  # Specifies whether or not to strip the fragment component from URLs.
42
39
  #
43
- # @option options [Boolean] :strip_query (false)
40
+ # @param [Boolean] strip_query
44
41
  # Specifies whether or not to strip the query component from URLs.
45
42
  #
46
43
  # @since 0.2.2
47
44
  #
48
- def initialize_sanitizers(options={})
49
- @strip_fragments = options.fetch(:strip_fragments,true)
50
- @strip_query = options.fetch(:strip_query,false)
45
+ def initialize_sanitizers(strip_fragments: true, strip_query: false)
46
+ @strip_fragments = strip_fragments
47
+ @strip_query = strip_query
51
48
  end
52
49
 
53
50
  end