spidr 0.6.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e2202e0ce389cbbc6f88360d7e7b430328bc6da973b69929ebc54b1d92c104bb
4
- data.tar.gz: 0777e972ef2cb1d540ee138a7703ee67f88b482260074c13e8c08a8da963aa77
3
+ metadata.gz: 46a2f2ad2ca789b83fac0e2519294403734e2ad6d647fbc3a612d429e57c1b43
4
+ data.tar.gz: b72f561e337c6a0fcdbca9f59562e06f0b5854b15d321f90be1a4168b352faca
5
5
  SHA512:
6
- metadata.gz: 45e923ad3aa59812de4af67cc3a1739d7751749b3aa3205c5979e1f29c302abd43316cd74ad7c61befda2d31b0ff080872047e9713537d3bfe2509a8d555156a
7
- data.tar.gz: deae3dbd7d9566723ca8760064c715de8795b1d4e4be1ae0866497d697a411af3e8bbe2a677028a5bf57bba8e681bd9d712cef3a2d0d123bc903af28f5e32a77
6
+ metadata.gz: ced221d8cdbeaf95df12d6c038de6539a5148657209137433cc82c5abc69779a13376a7e6becdf423d2f2bdd9ebfaf8c7b94a51dda70ffcbab932da4fc5260b3
7
+ data.tar.gz: f54bedf3648dd033b8a37388413ae4ab71b4b09f16cc508b8e43e72f2ef870c59fe325e3f36a841791d9d843acb08bb02009469168e9b231a9835a0249b55b6c
data/.editorconfig ADDED
@@ -0,0 +1,11 @@
1
+ root = true
2
+
3
+ [*]
4
+ end_of_line = lf
5
+ insert_final_newline = true
6
+ tab_width = 8
7
+ trim_trailing_whitespace = true
8
+
9
+ [{Gemfile,Rakefile,*.rb,*.gemspec,*.yml}]
10
+ indent_style = space
11
+ indent_size = 2
data/.github/workflows/ruby.yml ADDED
@@ -0,0 +1,26 @@
1
+ name: CI
2
+
3
+ on: [ push, pull_request ]
4
+
5
+ jobs:
6
+ tests:
7
+ runs-on: ubuntu-latest
8
+ strategy:
9
+ fail-fast: false
10
+ matrix:
11
+ ruby:
12
+ - 2.7
13
+ - '3.0'
14
+ - '3.1'
15
+ - jruby
16
+ name: Ruby ${{ matrix.ruby }}
17
+ steps:
18
+ - uses: actions/checkout@v2
19
+ - name: Set up Ruby
20
+ uses: ruby/setup-ruby@v1
21
+ with:
22
+ ruby-version: ${{ matrix.ruby }}
23
+ - name: Install dependencies
24
+ run: bundle install --jobs 4 --retry 3
25
+ - name: Run tests
26
+ run: bundle exec rake test
data/.gitignore CHANGED
@@ -1,8 +1,7 @@
1
- pkg
2
- doc
3
- web
4
- tmp
5
- Gemfile.lock
1
+ /Gemfile.lock
2
+ /coverage
3
+ /doc
4
+ /pkg
6
5
  .DS_Store
7
6
  .bundle
8
7
  .yardoc
data/ChangeLog.md CHANGED
@@ -1,6 +1,16 @@
1
+ ### 0.7.0 / 2022-12-31
2
+
3
+ * Added {Spidr.domain} and {Spidr::Agent.domain}.
4
+ * Added {Spidr::Page#gif?}.
5
+ * Added {Spidr::Page#jpeg?}.
6
+ * Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
7
+ * Added {Spidr::Page#png?}.
8
+ * {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
9
+ `URI::HTTP` object.
10
+
1
11
  ### 0.6.1 / 2019-10-24
2
12
 
3
- * Check for opaque component of URIs before attempting to set the path
13
+ * Check for the opaque component of URIs before attempting to set the path
4
14
  component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
5
15
  opaque` exceptions.
6
16
  * Fix `@robots` instance variable warning (@spk).
data/Gemfile CHANGED
@@ -12,10 +12,13 @@ group :development do
12
12
  gem 'rake'
13
13
  gem 'rubygems-tasks', '~> 0.2'
14
14
 
15
- gem 'rspec', '~> 3.0'
16
- gem 'webmock', '~> 3.0'
17
- gem 'sinatra', '~> 1.0'
15
+ gem 'rspec', '~> 3.0'
16
+ gem 'webmock', '~> 3.0'
17
+ gem 'sinatra', '~> 2.0'
18
+ gem 'simplecov', '~> 0.20'
18
19
 
19
20
  gem 'kramdown'
20
- gem 'yard', '~> 0.9'
21
+ gem 'redcarpet', platform: :mri
22
+ gem 'yard', '~> 0.9'
23
+ gem 'yard-spellcheck', require: false
21
24
  end
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2008-2016 Hal Brodigan
1
+ Copyright (c) 2008-2022 Hal Brodigan
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,11 +1,11 @@
1
1
  # Spidr
2
2
 
3
+ [![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
4
+
3
5
  * [Homepage](https://github.com/postmodern/spidr#readme)
4
6
  * [Source](https://github.com/postmodern/spidr)
5
7
  * [Issues](https://github.com/postmodern/spidr/issues)
6
8
  * [Mailing List](http://groups.google.com/group/spidr)
7
- * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
8
- * [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
9
9
 
10
10
  ## Description
11
11
 
@@ -49,137 +49,194 @@ and easy to use.
49
49
 
50
50
  Start spidering from a URL:
51
51
 
52
- Spidr.start_at('http://tenderlovemaking.com/')
52
+ ```ruby
53
+ Spidr.start_at('http://tenderlovemaking.com/') do |agent|
54
+ # ...
55
+ end
56
+ ```
53
57
 
54
58
  Spider a host:
55
59
 
56
- Spidr.host('solnic.eu')
60
+ ```ruby
61
+ Spidr.host('solnic.eu') do |agent|
62
+ # ...
63
+ end
64
+ ```
65
+
66
+ Spider a domain (and any sub-domains):
67
+
68
+ ```ruby
69
+ Spidr.domain('ruby-lang.org') do |agent|
70
+ # ...
71
+ end
72
+ ```
57
73
 
58
74
  Spider a site:
59
75
 
60
- Spidr.site('http://www.rubyflow.com/')
76
+ ```ruby
77
+ Spidr.site('http://www.rubyflow.com/') do |agent|
78
+ # ...
79
+ end
80
+ ```
61
81
 
62
82
  Spider multiple hosts:
63
83
 
64
- Spidr.start_at(
65
- 'http://company.com/',
66
- hosts: [
67
- 'company.com',
68
- /host[\d]+\.company\.com/
69
- ]
70
- )
84
+ ```ruby
85
+ Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
86
+ # ...
87
+ end
88
+ ```
71
89
 
72
90
  Do not spider certain links:
73
91
 
74
- Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])
92
+ ```ruby
93
+ Spidr.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
94
+ # ...
95
+ end
96
+ ```
75
97
 
76
98
  Do not spider links on certain ports:
77
99
 
78
- Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
100
+ ```ruby
101
+ Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
102
+ # ...
103
+ end
104
+ ```
79
105
 
80
106
  Do not spider links blacklisted in robots.txt:
81
107
 
82
- Spidr.site(
83
- 'http://company.com/',
84
- robots: true
85
- )
108
+ ```ruby
109
+ Spidr.site('http://company.com/', robots: true) do |agent|
110
+ # ...
111
+ end
112
+ ```
86
113
 
87
114
  Print out visited URLs:
88
115
 
89
- Spidr.site('http://www.rubyinside.com/') do |spider|
90
- spider.every_url { |url| puts url }
91
- end
116
+ ```ruby
117
+ Spidr.site('http://www.rubyinside.com/') do |spider|
118
+ spider.every_url { |url| puts url }
119
+ end
120
+ ```
92
121
 
93
122
  Build a URL map of a site:
94
123
 
95
- url_map = Hash.new { |hash,key| hash[key] = [] }
124
+ ```ruby
125
+ url_map = Hash.new { |hash,key| hash[key] = [] }
96
126
 
97
- Spidr.site('http://intranet.com/') do |spider|
98
- spider.every_link do |origin,dest|
99
- url_map[dest] << origin
100
- end
101
- end
127
+ Spidr.site('http://intranet.com/') do |spider|
128
+ spider.every_link do |origin,dest|
129
+ url_map[dest] << origin
130
+ end
131
+ end
132
+ ```
102
133
 
103
134
  Print out the URLs that could not be requested:
104
135
 
105
- Spidr.site('http://company.com/') do |spider|
106
- spider.every_failed_url { |url| puts url }
107
- end
136
+ ```ruby
137
+ Spidr.site('http://company.com/') do |spider|
138
+ spider.every_failed_url { |url| puts url }
139
+ end
140
+ ```
108
141
 
109
142
  Find all pages which have broken links:
110
143
 
111
- url_map = Hash.new { |hash,key| hash[key] = [] }
144
+ ```ruby
145
+ url_map = Hash.new { |hash,key| hash[key] = [] }
112
146
 
113
- spider = Spidr.site('http://intranet.com/') do |spider|
114
- spider.every_link do |origin,dest|
115
- url_map[dest] << origin
116
- end
117
- end
147
+ spider = Spidr.site('http://intranet.com/') do |spider|
148
+ spider.every_link do |origin,dest|
149
+ url_map[dest] << origin
150
+ end
151
+ end
118
152
 
119
- spider.failures.each do |url|
120
- puts "Broken link #{url} found in:"
153
+ spider.failures.each do |url|
154
+ puts "Broken link #{url} found in:"
121
155
 
122
- url_map[url].each { |page| puts " #{page}" }
123
- end
156
+ url_map[url].each { |page| puts " #{page}" }
157
+ end
158
+ ```
124
159
 
125
160
  Search HTML and XML pages:
126
161
 
127
- Spidr.site('http://company.com/') do |spider|
128
- spider.every_page do |page|
129
- puts ">>> #{page.url}"
162
+ ```ruby
163
+ Spidr.site('http://company.com/') do |spider|
164
+ spider.every_page do |page|
165
+ puts ">>> #{page.url}"
130
166
 
131
- page.search('//meta').each do |meta|
132
- name = (meta.attributes['name'] || meta.attributes['http-equiv'])
133
- value = meta.attributes['content']
167
+ page.search('//meta').each do |meta|
168
+ name = (meta.attributes['name'] || meta.attributes['http-equiv'])
169
+ value = meta.attributes['content']
134
170
 
135
- puts " #{name} = #{value}"
136
- end
137
- end
171
+ puts " #{name} = #{value}"
138
172
  end
173
+ end
174
+ end
175
+ ```
139
176
 
140
177
  Print out the titles from every page:
141
178
 
142
- Spidr.site('https://www.ruby-lang.org/') do |spider|
143
- spider.every_html_page do |page|
144
- puts page.title
145
- end
146
- end
179
+ ```ruby
180
+ Spidr.site('https://www.ruby-lang.org/') do |spider|
181
+ spider.every_html_page do |page|
182
+ puts page.title
183
+ end
184
+ end
185
+ ```
186
+
187
+ Print out every HTTP redirect:
188
+
189
+ ```ruby
190
+ Spidr.host('company.com') do |spider|
191
+ spider.every_redirect_page do |page|
192
+ puts "#{page.url} -> #{page.headers['Location']}"
193
+ end
194
+ end
195
+ ```
147
196
 
148
197
  Find what kinds of web servers a host is using, by accessing the headers:
149
198
 
150
- servers = Set[]
199
+ ```ruby
200
+ servers = Set[]
151
201
 
152
- Spidr.host('company.com') do |spider|
153
- spider.all_headers do |headers|
154
- servers << headers['server']
155
- end
156
- end
202
+ Spidr.host('company.com') do |spider|
203
+ spider.all_headers do |headers|
204
+ servers << headers['server']
205
+ end
206
+ end
207
+ ```
157
208
 
158
209
  Pause the spider on a forbidden page:
159
210
 
160
- Spidr.host('company.com') do |spider|
161
- spider.every_forbidden_page do |page|
162
- spider.pause!
163
- end
164
- end
211
+ ```ruby
212
+ Spidr.host('company.com') do |spider|
213
+ spider.every_forbidden_page do |page|
214
+ spider.pause!
215
+ end
216
+ end
217
+ ```
165
218
 
166
219
  Skip the processing of a page:
167
220
 
168
- Spidr.host('company.com') do |spider|
169
- spider.every_missing_page do |page|
170
- spider.skip_page!
171
- end
172
- end
221
+ ```ruby
222
+ Spidr.host('company.com') do |spider|
223
+ spider.every_missing_page do |page|
224
+ spider.skip_page!
225
+ end
226
+ end
227
+ ```
173
228
 
174
229
  Skip the processing of links:
175
230
 
176
- Spidr.host('company.com') do |spider|
177
- spider.every_url do |url|
178
- if url.path.split('/').find { |dir| dir.to_i > 1000 }
179
- spider.skip_link!
180
- end
181
- end
231
+ ```ruby
232
+ Spidr.host('company.com') do |spider|
233
+ spider.every_url do |url|
234
+ if url.path.split('/').find { |dir| dir.to_i > 1000 }
235
+ spider.skip_link!
182
236
  end
237
+ end
238
+ end
239
+ ```
183
240
 
184
241
  ## Requirements
185
242
 
@@ -188,11 +245,13 @@ Skip the processing of links:
188
245
 
189
246
  ## Install
190
247
 
191
- $ gem install spidr
248
+ ```shell
249
+ $ gem install spidr
250
+ ```
192
251
 
193
252
  ## License
194
253
 
195
- Copyright (c) 2008-2016 Hal Brodigan
254
+ Copyright (c) 2008-2022 Hal Brodigan
196
255
 
197
256
  See {file:LICENSE.txt} for license information.
198
257
 
data/Rakefile CHANGED
@@ -12,6 +12,7 @@ Gem::Tasks.new
12
12
 
13
13
  require 'rspec/core/rake_task'
14
14
  RSpec::Core::RakeTask.new
15
+ task :test => :spec
15
16
  task :default => :spec
16
17
 
17
18
  require 'yard'
data/gemspec.yml CHANGED
@@ -11,6 +11,13 @@ email: postmodern.mod3@gmail.com
11
11
  homepage: https://github.com/postmodern/spidr#readme
12
12
  has_yard: true
13
13
 
14
+ metadata:
15
+ documentation_uri: https://rubydoc.info/gems/spidr
16
+ source_code_uri: https://github.com/postmodern/spidr
17
+ bug_tracker_uri: https://github.com/postmodern/spidr/issues
18
+ changelog_uri: https://github.com/postmodern/spidr/blob/master/ChangeLog.md
19
+ rubygems_mfa_required: 'true'
20
+
14
21
  required_ruby_version: ">= 2.0.0"
15
22
 
16
23
  dependencies:
@@ -96,7 +96,7 @@ module Spidr
96
96
 
97
97
  protected
98
98
 
99
- def initialize_actions(options={})
99
+ def initialize_actions
100
100
  @paused = false
101
101
  end
102
102
  end
@@ -520,7 +520,7 @@ module Spidr
520
520
 
521
521
  protected
522
522
 
523
- def initialize_events(options={})
523
+ def initialize_events
524
524
  @every_url_blocks = []
525
525
  @every_failed_url_blocks = []
526
526
  @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@@ -356,89 +356,88 @@ module Spidr
356
356
  #
357
357
  # Initializes filtering rules.
358
358
  #
359
- # @param [Hash] options
360
- # Additional options.
361
- #
362
- # @option options [Array] :schemes (['http', 'https'])
359
+ # @param [Array<String>] schemes
363
360
  # The list of acceptable URI schemes to visit.
364
361
  # The `https` scheme will be ignored if `net/https` cannot be loaded.
365
362
  #
366
- # @option options [String] :host
363
+ # @param [String] host
367
364
  # The host-name to visit.
368
365
  #
369
- # @option options [Array<String, Regexp, Proc>] :hosts
366
+ # @param [Array<String, Regexp, Proc>] hosts
370
367
  # The patterns which match the host-names to visit.
371
368
  #
372
- # @option options [Array<String, Regexp, Proc>] :ignore_hosts
369
+ # @param [Array<String, Regexp, Proc>] ignore_hosts
373
370
  # The patterns which match the host-names to not visit.
374
371
  #
375
- # @option options [Array<Integer, Regexp, Proc>] :ports
372
+ # @param [Array<Integer, Regexp, Proc>] ports
376
373
  # The patterns which match the ports to visit.
377
374
  #
378
- # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
375
+ # @param [Array<Integer, Regexp, Proc>] ignore_ports
379
376
  # The patterns which match the ports to not visit.
380
377
  #
381
- # @option options [Array<String, Regexp, Proc>] :links
378
+ # @param [Array<String, Regexp, Proc>] links
382
379
  # The patterns which match the links to visit.
383
380
  #
384
- # @option options [Array<String, Regexp, Proc>] :ignore_links
381
+ # @param [Array<String, Regexp, Proc>] ignore_links
385
382
  # The patterns which match the links to not visit.
386
383
  #
387
- # @option options [Array<String, Regexp, Proc>] :urls
384
+ # @param [Array<String, Regexp, Proc>] urls
388
385
  # The patterns which match the URLs to visit.
389
386
  #
390
- # @option options [Array<String, Regexp, Proc>] :ignore_urls
387
+ # @param [Array<String, Regexp, Proc>] ignore_urls
391
388
  # The patterns which match the URLs to not visit.
392
389
  #
393
- # @option options [Array<String, Regexp, Proc>] :exts
390
+ # @param [Array<String, Regexp, Proc>] exts
394
391
  # The patterns which match the URI path extensions to visit.
395
392
  #
396
- # @option options [Array<String, Regexp, Proc>] :ignore_exts
393
+ # @param [Array<String, Regexp, Proc>] ignore_exts
397
394
  # The patterns which match the URI path extensions to not visit.
398
395
  #
399
- def initialize_filters(options={})
400
- @schemes = []
396
+ def initialize_filters(schemes: self.class.default_schemes,
397
+ host: nil,
398
+ hosts: nil,
399
+ ignore_hosts: nil,
400
+ ports: nil,
401
+ ignore_ports: nil,
402
+ links: nil,
403
+ ignore_links: nil,
404
+ urls: nil,
405
+ ignore_urls: nil,
406
+ exts: nil,
407
+ ignore_exts: nil)
408
+ @schemes = schemes.map(&:to_s)
409
+
410
+ @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
411
+ @port_rules = Rules.new(accept: ports, reject: ignore_ports)
412
+ @link_rules = Rules.new(accept: links, reject: ignore_links)
413
+ @url_rules = Rules.new(accept: urls, reject: ignore_urls)
414
+ @ext_rules = Rules.new(accept: exts, reject: ignore_exts)
415
+
416
+ visit_hosts_like(host) if host
417
+ end
401
418
 
402
- if options[:schemes]
403
- self.schemes = options[:schemes]
404
- else
405
- @schemes << 'http'
419
+ #
420
+ # Determines the default URI schemes to follow.
421
+ #
422
+ # @return [Array<String>]
423
+ # The default URI schemes to follow.
424
+ #
425
+ # @since 0.7.0
426
+ #
427
+ def self.default_schemes
428
+ schemes = ['http']
406
429
 
407
- begin
408
- require 'net/https'
430
+ begin
431
+ require 'net/https'
409
432
 
410
- @schemes << 'https'
411
- rescue Gem::LoadError => e
412
- raise(e)
413
- rescue ::LoadError
414
- warn "Warning: cannot load 'net/https', https support disabled"
415
- end
433
+ schemes << 'https'
434
+ rescue Gem::LoadError => e
435
+ raise(e)
436
+ rescue ::LoadError
437
+ warn "Warning: cannot load 'net/https', https support disabled"
416
438
  end
417
439
 
418
- @host_rules = Rules.new(
419
- accept: options[:hosts],
420
- reject: options[:ignore_hosts]
421
- )
422
- @port_rules = Rules.new(
423
- accept: options[:ports],
424
- reject: options[:ignore_ports]
425
- )
426
- @link_rules = Rules.new(
427
- accept: options[:links],
428
- reject: options[:ignore_links]
429
- )
430
- @url_rules = Rules.new(
431
- accept: options[:urls],
432
- reject: options[:ignore_urls]
433
- )
434
- @ext_rules = Rules.new(
435
- accept: options[:exts],
436
- reject: options[:ignore_exts]
437
- )
438
-
439
- if options[:host]
440
- visit_hosts_like(options[:host])
441
- end
440
+ return schemes
442
441
  end
443
442
 
444
443
  #
@@ -34,20 +34,17 @@ module Spidr
34
34
  #
35
35
  # Initializes the Sanitizer rules.
36
36
  #
37
- # @param [Hash] options
38
- # Additional options.
39
- #
40
- # @option options [Boolean] :strip_fragments (true)
37
+ # @param [Boolean] strip_fragments
41
38
  # Specifies whether or not to strip the fragment component from URLs.
42
39
  #
43
- # @option options [Boolean] :strip_query (false)
40
+ # @param [Boolean] strip_query
44
41
  # Specifies whether or not to strip the query component from URLs.
45
42
  #
46
43
  # @since 0.2.2
47
44
  #
48
- def initialize_sanitizers(options={})
49
- @strip_fragments = options.fetch(:strip_fragments,true)
50
- @strip_query = options.fetch(:strip_query,false)
45
+ def initialize_sanitizers(strip_fragments: true, strip_query: false)
46
+ @strip_fragments = strip_fragments
47
+ @strip_query = strip_query
51
48
  end
52
49
 
53
50
  end