spidr 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +11 -1
- data/Gemfile +7 -4
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +7 -0
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +52 -53
- data/lib/spidr/agent/sanitizers.rb +5 -8
- data/lib/spidr/agent.rb +219 -97
- data/lib/spidr/auth_store.rb +1 -1
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +16 -18
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +21 -19
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +15 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +356 -7
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +8 -7
- data/.travis.yml +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 46a2f2ad2ca789b83fac0e2519294403734e2ad6d647fbc3a612d429e57c1b43
+  data.tar.gz: b72f561e337c6a0fcdbca9f59562e06f0b5854b15d321f90be1a4168b352faca
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ced221d8cdbeaf95df12d6c038de6539a5148657209137433cc82c5abc69779a13376a7e6becdf423d2f2bdd9ebfaf8c7b94a51dda70ffcbab932da4fc5260b3
+  data.tar.gz: f54bedf3648dd033b8a37388413ae4ab71b4b09f16cc508b8e43e72f2ef870c59fe325e3f36a841791d9d843acb08bb02009469168e9b231a9835a0249b55b6c
data/.github/workflows/ruby.yml
ADDED
@@ -0,0 +1,26 @@
+name: CI
+
+on: [ push, pull_request ]
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby:
+          - 2.7
+          - '3.0'
+          - '3.1'
+          - jruby
+    name: Ruby ${{ matrix.ruby }}
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: Install dependencies
+        run: bundle install --jobs 4 --retry 3
+      - name: Run tests
+        run: bundle exec rake test
data/.gitignore
CHANGED
data/ChangeLog.md
CHANGED
@@ -1,6 +1,16 @@
+### 0.7.0 / 2022-12-31
+
+* Added {Spidr.domain} and {Spidr::Agent.domain}.
+* Added {Spidr::Page#gif?}.
+* Added {Spidr::Page#jpeg?}.
+* Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
+* Added {Spidr::Page#png?}.
+* {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
+  `URI::HTTP` object.
+
 ### 0.6.1 / 2019-10-24
 
-* Check for opaque component of URIs before attempting to set the path
+* Check for the opaque component of URIs before attempting to set the path
   component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
   opaque` exceptions.
 * Fix `@robots` instance variable warning (@spk).
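Taken together, the 0.7.0 entry above adds a domain spider, image content-type predicates on `Spidr::Page`, and more flexible proxy assignment. A minimal sketch of how the new API surface combines (the domain and proxy URL below are placeholders, not values from this diff):

```ruby
require 'spidr'

# New in 0.7.0: the proxy can be assigned from a String (or URI::HTTP),
# not just an options Hash. The proxy URL here is a placeholder.
Spidr.proxy = 'http://proxy.example.com:8080'

# New in 0.7.0: spider a domain and all of its sub-domains.
Spidr.domain('example.com') do |agent|
  agent.every_page do |page|
    # New in 0.7.0: image content-type predicates on Spidr::Page.
    if page.png? || page.jpeg? || page.gif? || page.icon?
      puts "image: #{page.url}"
    end
  end
end
```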
data/Gemfile
CHANGED
@@ -12,10 +12,13 @@ group :development do
   gem 'rake'
   gem 'rubygems-tasks', '~> 0.2'
 
-  gem 'rspec',
-  gem 'webmock',
-  gem 'sinatra',
+  gem 'rspec',     '~> 3.0'
+  gem 'webmock',   '~> 3.0'
+  gem 'sinatra',   '~> 2.0'
+  gem 'simplecov', '~> 0.20'
 
   gem 'kramdown'
-  gem '
+  gem 'redcarpet',       platform: :mri
+  gem 'yard',            '~> 0.9'
+  gem 'yard-spellcheck', require: false
 end
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,11 @@
 # Spidr
 
+[![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
+
 * [Homepage](https://github.com/postmodern/spidr#readme)
 * [Source](https://github.com/postmodern/spidr)
 * [Issues](https://github.com/postmodern/spidr/issues)
 * [Mailing List](http://groups.google.com/group/spidr)
-* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
-* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
 
 ## Description
 
@@ -49,137 +49,194 @@ and easy to use.
 
 Start spidering from a URL:
 
-
+```ruby
+Spidr.start_at('http://tenderlovemaking.com/') do |agent|
+  # ...
+end
+```
 
 Spider a host:
 
-
+```ruby
+Spidr.host('solnic.eu') do |agent|
+  # ...
+end
+```
+
+Spider a domain (and any sub-domains):
+
+```ruby
+Spidr.domain('ruby-lang.org') do |agent|
+  # ...
+end
+```
 
 Spider a site:
 
-
+```ruby
+Spidr.site('http://www.rubyflow.com/') do |agent|
+  # ...
+end
+```
 
 Spider multiple hosts:
 
-
-
-
-
-
-]
-)
+```ruby
+Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
+  # ...
+end
+```
 
 Do not spider certain links:
 
-
+```ruby
+Spidr.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
+  # ...
+end
+```
 
 Do not spider links on certain ports:
 
-
+```ruby
+Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
+  # ...
+end
+```
 
 Do not spider links blacklisted in robots.txt:
 
-
-
-
-
+```ruby
+Spidr.site('http://company.com/', robots: true) do |agent|
+  # ...
+end
+```
 
 Print out visited URLs:
 
-
-
-
+```ruby
+Spidr.site('http://www.rubyinside.com/') do |spider|
+  spider.every_url { |url| puts url }
+end
+```
 
 Build a URL map of a site:
 
-
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
 
-
-
-
-
-
+Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
+```
 
 Print out the URLs that could not be requested:
 
-
-
-
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_failed_url { |url| puts url }
+end
+```
 
 Finds all pages which have broken links:
 
-
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
 
-
-
-
-
-
+spider = Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
 
-
-
+spider.failures.each do |url|
+  puts "Broken link #{url} found in:"
 
-
-
+  url_map[url].each { |page| puts "  #{page}" }
+end
+```
 
 Search HTML and XML pages:
 
-
-
-
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_page do |page|
+    puts ">>> #{page.url}"
 
-
-
-
+    page.search('//meta').each do |meta|
+      name = (meta.attributes['name'] || meta.attributes['http-equiv'])
+      value = meta.attributes['content']
 
-
-end
-end
+      puts "  #{name} = #{value}"
     end
+  end
+end
+```
 
 Print out the titles from every page:
 
-
-
-
-
-
+```ruby
+Spidr.site('https://www.ruby-lang.org/') do |spider|
+  spider.every_html_page do |page|
+    puts page.title
+  end
+end
+```
+
+Print out every HTTP redirect:
+
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_redirect_page do |page|
+    puts "#{page.url} -> #{page.headers['Location']}"
+  end
+end
+```
 
 Find what kinds of web servers a host is using, by accessing the headers:
 
-
+```ruby
+servers = Set[]
 
-
-
-
-
-
+Spidr.host('company.com') do |spider|
+  spider.all_headers do |headers|
+    servers << headers['server']
+  end
+end
+```
 
 Pause the spider on a forbidden page:
 
-
-
-
-
-
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_forbidden_page do |page|
+    spider.pause!
+  end
+end
+```
 
 Skip the processing of a page:
 
-
-
-
-
-
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_missing_page do |page|
+    spider.skip_page!
+  end
+end
+```
 
 Skip the processing of links:
 
-
-
-
-
-
-end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_url do |url|
+    if url.path.split('/').find { |dir| dir.to_i > 1000 }
+      spider.skip_link!
     end
+  end
+end
+```
 
 ## Requirements
 
@@ -188,11 +245,13 @@ Skip the processing of links:
 
 ## Install
 
-
+```shell
+$ gem install spidr
+```
 
 ## License
 
-Copyright (c) 2008-
+Copyright (c) 2008-2022 Hal Brodigan
 
 See {file:LICENSE.txt} for license information.
 
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
@@ -11,6 +11,13 @@ email: postmodern.mod3@gmail.com
 homepage: https://github.com/postmodern/spidr#readme
 has_yard: true
 
+metadata:
+  documentation_uri: https://rubydoc.info/gems/spidr
+  source_code_uri: https://github.com/postmodern/spidr.rb
+  bug_tracker_uri: https://github.com/postmodern/spidr.rb/issues
+  changelog_uri: https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md
+  rubygems_mfa_required: 'true'
+
 required_ruby_version: ">= 2.0.0"
 
 dependencies:
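For context, `gemspec.yml` is the YAML source that this project's `.gemspec` reads at build time, so the new `metadata:` map above should surface on the built gem roughly as follows (a sketch of the resulting `Gem::Specification`, not the actual `.gemspec` file):

```ruby
Gem::Specification.new do |spec|
  spec.name = 'spidr'
  # Populated from the metadata: section of gemspec.yml:
  spec.metadata = {
    'documentation_uri'     => 'https://rubydoc.info/gems/spidr',
    'source_code_uri'       => 'https://github.com/postmodern/spidr.rb',
    'bug_tracker_uri'       => 'https://github.com/postmodern/spidr.rb/issues',
    'changelog_uri'         => 'https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md',
    'rubygems_mfa_required' => 'true'
  }
end
```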
data/lib/spidr/agent/actions.rb
CHANGED
data/lib/spidr/agent/events.rb
CHANGED
data/lib/spidr/agent/filters.rb
CHANGED
@@ -356,89 +356,88 @@ module Spidr
     #
     # Initializes filtering rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Array] :schemes (['http', 'https'])
+    # @param [Array<String>] schemes
     #   The list of acceptable URI schemes to visit.
     #   The `https` scheme will be ignored if `net/https` cannot be loaded.
     #
-    # @option options [String] :host
+    # @param [String] host
     #   The host-name to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :hosts
+    # @param [Array<String, Regexp, Proc>] hosts
     #   The patterns which match the host-names to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
+    # @param [Array<String, Regexp, Proc>] ignore_hosts
     #   The patterns which match the host-names to not visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ports
+    # @param [Array<Integer, Regexp, Proc>] ports
     #   The patterns which match the ports to visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
+    # @param [Array<Integer, Regexp, Proc>] ignore_ports
     #   The patterns which match the ports to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :links
+    # @param [Array<String, Regexp, Proc>] links
     #   The patterns which match the links to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_links
+    # @param [Array<String, Regexp, Proc>] ignore_links
     #   The patterns which match the links to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :urls
+    # @param [Array<String, Regexp, Proc>] urls
     #   The patterns which match the URLs to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_urls
+    # @param [Array<String, Regexp, Proc>] ignore_urls
     #   The patterns which match the URLs to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :exts
+    # @param [Array<String, Regexp, Proc>] exts
     #   The patterns which match the URI path extensions to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_exts
+    # @param [Array<String, Regexp, Proc>] ignore_exts
     #   The patterns which match the URI path extensions to not visit.
     #
-    def initialize_filters(options={})
-      @schemes = []
+    def initialize_filters(schemes:      self.class.default_schemes,
+                           host:         nil,
+                           hosts:        nil,
+                           ignore_hosts: nil,
+                           ports:        nil,
+                           ignore_ports: nil,
+                           links:        nil,
+                           ignore_links: nil,
+                           urls:         nil,
+                           ignore_urls:  nil,
+                           exts:         nil,
+                           ignore_exts:  nil)
+      @schemes = schemes.map(&:to_s)
+
+      @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
+      @port_rules = Rules.new(accept: ports, reject: ignore_ports)
+      @link_rules = Rules.new(accept: links, reject: ignore_links)
+      @url_rules  = Rules.new(accept: urls,  reject: ignore_urls)
+      @ext_rules  = Rules.new(accept: exts,  reject: ignore_exts)
+
+      visit_hosts_like(host) if host
+    end
 
-      if options[:schemes]
-        @schemes += options[:schemes].map(&:to_s)
-      else
-        @schemes << 'http'
+    #
+    # Determines the default URI schemes to follow.
+    #
+    # @return [Array<String>]
+    #   The default URI schemes to follow.
+    #
+    # @since 0.6.2
+    #
+    def self.default_schemes
+      schemes = ['http']
 
-        begin
-          require 'net/https'
+      begin
+        require 'net/https'
 
-          @schemes << 'https'
-        rescue Gem::LoadError => e
-          raise(e)
-        rescue ::LoadError
-          warn "Warning: cannot load 'net/https', https support disabled"
-        end
+        schemes << 'https'
+      rescue Gem::LoadError => e
+        raise(e)
+      rescue ::LoadError
+        warn "Warning: cannot load 'net/https', https support disabled"
       end
 
-      @host_rules = Rules.new(
-        accept: options[:hosts],
-        reject: options[:ignore_hosts]
-      )
-      @port_rules = Rules.new(
-        accept: options[:ports],
-        reject: options[:ignore_ports]
-      )
-      @link_rules = Rules.new(
-        accept: options[:links],
-        reject: options[:ignore_links]
-      )
-      @url_rules = Rules.new(
-        accept: options[:urls],
-        reject: options[:ignore_urls]
-      )
-      @ext_rules = Rules.new(
-        accept: options[:exts],
-        reject: options[:ignore_exts]
-      )
-
-      if options[:host]
-        visit_hosts_like(options[:host])
-      end
+      return schemes
     end
 
     #
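The refactor above replaces the `options` hash with explicit keyword arguments, so the names documented in the `@param` tags are exactly the keywords accepted by `Agent#initialize` (and by `Spidr.site`, `Spidr.host`, etc., which forward to it). A sketch of how they might be used together (the host and patterns below are placeholders):

```ruby
require 'spidr'

agent = Spidr::Agent.new(
  host:         'www.example.com',       # only spider this host
  ignore_links: [%r{^/(login|logout)}],  # skip links matching these patterns
  ignore_exts:  %w[zip pdf],             # skip these URI path extensions
  schemes:      ['https']                # override default_schemes
)
agent.start_at('https://www.example.com/')
```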
data/lib/spidr/agent/sanitizers.rb
CHANGED
@@ -34,20 +34,17 @@ module Spidr
     #
     # Initializes the Sanitizer rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Boolean] :strip_fragments (true)
+    # @param [Boolean] strip_fragments
     #   Specifies whether or not to strip the fragment component from URLs.
     #
-    # @option options [Boolean] :strip_query (false)
+    # @param [Boolean] strip_query
     #   Specifies whether or not to strip the query component from URLs.
     #
     # @since 0.2.2
     #
-    def initialize_sanitizers(options={})
-      @strip_fragments = options.fetch(:strip_fragments,true)
-      @strip_query     = options.fetch(:strip_query,false)
+    def initialize_sanitizers(strip_fragments: true, strip_query: false)
+      @strip_fragments = strip_fragments
+      @strip_query     = strip_query
     end
 
 end
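As with the filters, the sanitizer options are now explicit keywords, which `Agent#initialize` forwards to `initialize_sanitizers`. A sketch of the caller-facing effect (the URL below is a placeholder):

```ruby
require 'spidr'

agent = Spidr::Agent.new(
  strip_fragments: true, # drop #fragment from discovered URLs (the default)
  strip_query:     true  # also drop ?query strings (default is false)
)
agent.start_at('http://www.example.com/')
```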