spidr 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +9 -7
- data/ChangeLog.md +7 -0
- data/Gemfile +3 -3
- data/README.md +1 -1
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +11 -23
- data/lib/spidr/agent/filters.rb +3 -3
- data/lib/spidr/agent/sanitizers.rb +1 -1
- data/lib/spidr/auth_store.rb +9 -5
- data/lib/spidr/page/html.rb +1 -1
- data/lib/spidr/session_cache.rb +3 -3
- data/lib/spidr/spidr.rb +1 -0
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +1 -3
- metadata +15 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e2202e0ce389cbbc6f88360d7e7b430328bc6da973b69929ebc54b1d92c104bb
|
4
|
+
data.tar.gz: 0777e972ef2cb1d540ee138a7703ee67f88b482260074c13e8c08a8da963aa77
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45e923ad3aa59812de4af67cc3a1739d7751749b3aa3205c5979e1f29c302abd43316cd74ad7c61befda2d31b0ff080872047e9713537d3bfe2509a8d555156a
|
7
|
+
data.tar.gz: deae3dbd7d9566723ca8760064c715de8795b1d4e4be1ae0866497d697a411af3e8bbe2a677028a5bf57bba8e681bd9d712cef3a2d0d123bc903af28f5e32a77
|
data/.travis.yml
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
---
|
2
|
+
before_install:
|
3
|
+
- gem update --system
|
4
|
+
- gem install bundler -v "~> 2.0"
|
2
5
|
language: ruby
|
6
|
+
sudo: false
|
7
|
+
cache:
|
8
|
+
- bundler
|
3
9
|
rvm:
|
4
|
-
- 2.
|
5
|
-
- 2.
|
6
|
-
- 2.2.4
|
7
|
-
- 2.3.1
|
10
|
+
- 2.5
|
11
|
+
- 2.6
|
8
12
|
- jruby
|
9
|
-
- rbx
|
10
13
|
matrix:
|
11
14
|
allow_failures:
|
12
15
|
- rvm: jruby
|
13
|
-
|
14
|
-
script: rake spec
|
16
|
+
script: bundle exec rake spec
|
data/ChangeLog.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
### 0.6.1 / 2019-10-24
|
2
|
+
|
3
|
+
* Check for opaque component of URIs before attempting to set the path
|
4
|
+
component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
|
5
|
+
opaque` exceptions.
|
6
|
+
* Fix `@robots` instance variable warning (@spk).
|
7
|
+
|
1
8
|
### 0.6.0 / 2016-08-04
|
2
9
|
|
3
10
|
* Added {Spidr::Proxy}.
|
data/Gemfile
CHANGED
@@ -13,9 +13,9 @@ group :development do
|
|
13
13
|
gem 'rubygems-tasks', '~> 0.2'
|
14
14
|
|
15
15
|
gem 'rspec', '~> 3.0'
|
16
|
-
gem 'webmock', '~>
|
16
|
+
gem 'webmock', '~> 3.0'
|
17
17
|
gem 'sinatra', '~> 1.0'
|
18
18
|
|
19
|
-
gem 'kramdown'
|
20
|
-
gem 'yard', '~> 0.
|
19
|
+
gem 'kramdown'
|
20
|
+
gem 'yard', '~> 0.9'
|
21
21
|
end
|
data/README.md
CHANGED
@@ -157,7 +157,7 @@ Find what kinds of web servers a host is using, by accessing the headers:
|
|
157
157
|
|
158
158
|
Pause the spider on a forbidden page:
|
159
159
|
|
160
|
-
|
160
|
+
Spidr.host('company.com') do |spider|
|
161
161
|
spider.every_forbidden_page do |page|
|
162
162
|
spider.pause!
|
163
163
|
end
|
data/gemspec.yml
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -268,7 +268,7 @@ module Spidr
|
|
268
268
|
# @see #initialize
|
269
269
|
#
|
270
270
|
def self.site(url,options={},&block)
|
271
|
-
url = URI(url
|
271
|
+
url = URI(url)
|
272
272
|
|
273
273
|
agent = new(options.merge(host: url.host),&block)
|
274
274
|
agent.start_at(url)
|
@@ -408,9 +408,7 @@ module Spidr
|
|
408
408
|
@history.clear
|
409
409
|
|
410
410
|
new_history.each do |url|
|
411
|
-
|
412
|
-
|
413
|
-
@history << url
|
411
|
+
@history << URI(url)
|
414
412
|
end
|
415
413
|
|
416
414
|
return @history
|
@@ -425,7 +423,7 @@ module Spidr
|
|
425
423
|
# The links which have been visited.
|
426
424
|
#
|
427
425
|
def visited_links
|
428
|
-
@history.map
|
426
|
+
@history.map(&:to_s)
|
429
427
|
end
|
430
428
|
|
431
429
|
#
|
@@ -435,7 +433,7 @@ module Spidr
|
|
435
433
|
# The hosts which have been visited.
|
436
434
|
#
|
437
435
|
def visited_hosts
|
438
|
-
visited_urls.map
|
436
|
+
visited_urls.map(&:host).uniq
|
439
437
|
end
|
440
438
|
|
441
439
|
#
|
@@ -448,9 +446,7 @@ module Spidr
|
|
448
446
|
# Specifies whether a URL was visited.
|
449
447
|
#
|
450
448
|
def visited?(url)
|
451
|
-
|
452
|
-
|
453
|
-
return @history.include?(url)
|
449
|
+
@history.include?(URI(url))
|
454
450
|
end
|
455
451
|
|
456
452
|
#
|
@@ -469,9 +465,7 @@ module Spidr
|
|
469
465
|
@failures.clear
|
470
466
|
|
471
467
|
new_failures.each do |url|
|
472
|
-
|
473
|
-
|
474
|
-
@failures << url
|
468
|
+
@failures << URI(url)
|
475
469
|
end
|
476
470
|
|
477
471
|
return @failures
|
@@ -487,9 +481,7 @@ module Spidr
|
|
487
481
|
# Specifies whether the given URL was unable to be visited.
|
488
482
|
#
|
489
483
|
def failed?(url)
|
490
|
-
|
491
|
-
|
492
|
-
return @failures.include?(url)
|
484
|
+
@failures.include?(URI(url))
|
493
485
|
end
|
494
486
|
|
495
487
|
alias pending_urls queue
|
@@ -510,9 +502,7 @@ module Spidr
|
|
510
502
|
@queue.clear
|
511
503
|
|
512
504
|
new_queue.each do |url|
|
513
|
-
|
514
|
-
|
515
|
-
@queue << url
|
505
|
+
@queue << URI(url)
|
516
506
|
end
|
517
507
|
|
518
508
|
return @queue
|
@@ -594,7 +584,7 @@ module Spidr
|
|
594
584
|
# The page for the response, or `nil` if the request failed.
|
595
585
|
#
|
596
586
|
def get_page(url)
|
597
|
-
url = URI(url
|
587
|
+
url = URI(url)
|
598
588
|
|
599
589
|
prepare_request(url) do |session,path,headers|
|
600
590
|
new_page = Page.new(url,session.get(path,headers))
|
@@ -629,7 +619,7 @@ module Spidr
|
|
629
619
|
# @since 0.2.2
|
630
620
|
#
|
631
621
|
def post_page(url,post_data='')
|
632
|
-
url = URI(url
|
622
|
+
url = URI(url)
|
633
623
|
|
634
624
|
prepare_request(url) do |session,path,headers|
|
635
625
|
new_page = Page.new(url,session.post(path,post_data,headers))
|
@@ -725,7 +715,7 @@ module Spidr
|
|
725
715
|
|
726
716
|
unless @host_headers.empty?
|
727
717
|
@host_headers.each do |name,header|
|
728
|
-
if host.match(name)
|
718
|
+
if url.host.match(name)
|
729
719
|
headers['Host'] = header
|
730
720
|
break
|
731
721
|
end
|
@@ -769,8 +759,6 @@ module Spidr
|
|
769
759
|
# @since 0.2.2
|
770
760
|
#
|
771
761
|
def prepare_request(url,&block)
|
772
|
-
host = url.host
|
773
|
-
port = url.port
|
774
762
|
path = unless url.path.empty?
|
775
763
|
url.path
|
776
764
|
else
|
data/lib/spidr/agent/filters.rb
CHANGED
@@ -16,7 +16,7 @@ module Spidr
|
|
16
16
|
# agent.schemes = ['http']
|
17
17
|
#
|
18
18
|
def schemes=(new_schemes)
|
19
|
-
@schemes = new_schemes.map
|
19
|
+
@schemes = new_schemes.map(&:to_s)
|
20
20
|
end
|
21
21
|
|
22
22
|
#
|
@@ -452,9 +452,9 @@ module Spidr
|
|
452
452
|
#
|
453
453
|
def visit_scheme?(scheme)
|
454
454
|
if scheme
|
455
|
-
|
455
|
+
@schemes.include?(scheme)
|
456
456
|
else
|
457
|
-
|
457
|
+
true
|
458
458
|
end
|
459
459
|
end
|
460
460
|
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -34,7 +34,7 @@ module Spidr
|
|
34
34
|
#
|
35
35
|
def [](url)
|
36
36
|
# normalize the url
|
37
|
-
url = URI(url
|
37
|
+
url = URI(url)
|
38
38
|
|
39
39
|
key = [url.scheme, url.host, url.port]
|
40
40
|
paths = @credentials[key]
|
@@ -42,7 +42,7 @@ module Spidr
|
|
42
42
|
return nil unless paths
|
43
43
|
|
44
44
|
# longest path first
|
45
|
-
ordered_paths = paths.keys.sort_by { |
|
45
|
+
ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
|
46
46
|
|
47
47
|
# directories of the path
|
48
48
|
path_dirs = URI.expand_path(url.path).split('/')
|
@@ -70,7 +70,7 @@ module Spidr
|
|
70
70
|
#
|
71
71
|
def []=(url,auth)
|
72
72
|
# normalize the url
|
73
|
-
url = URI(url
|
73
|
+
url = URI(url)
|
74
74
|
|
75
75
|
# normalize the URL path
|
76
76
|
path = URI.expand_path(url.path)
|
@@ -118,7 +118,7 @@ module Spidr
|
|
118
118
|
#
|
119
119
|
def for_url(url)
|
120
120
|
if (auth = self[url])
|
121
|
-
|
121
|
+
Base64.encode64("#{auth.username}:#{auth.password}")
|
122
122
|
end
|
123
123
|
end
|
124
124
|
|
@@ -144,7 +144,11 @@ module Spidr
|
|
144
144
|
# @since 0.2.2
|
145
145
|
#
|
146
146
|
def size
|
147
|
-
|
147
|
+
total = 0
|
148
|
+
|
149
|
+
@credentials.each_value { |paths| total += paths.length }
|
150
|
+
|
151
|
+
return total
|
148
152
|
end
|
149
153
|
|
150
154
|
#
|
data/lib/spidr/page/html.rb
CHANGED
@@ -271,7 +271,7 @@ module Spidr
|
|
271
271
|
return
|
272
272
|
end
|
273
273
|
|
274
|
-
if (path = new_url.path)
|
274
|
+
if (!new_url.opaque) && (path = new_url.path)
|
275
275
|
# ensure that paths begin with a leading '/' for URI::FTP
|
276
276
|
if (new_url.scheme == 'ftp' && !path.start_with?('/'))
|
277
277
|
path.insert(0,'/')
|
data/lib/spidr/session_cache.rb
CHANGED
@@ -65,7 +65,7 @@ module Spidr
|
|
65
65
|
#
|
66
66
|
def active?(url)
|
67
67
|
# normalize the url
|
68
|
-
url = URI(url
|
68
|
+
url = URI(url)
|
69
69
|
|
70
70
|
# session key
|
71
71
|
key = key_for(url)
|
@@ -84,7 +84,7 @@ module Spidr
|
|
84
84
|
#
|
85
85
|
def [](url)
|
86
86
|
# normalize the url
|
87
|
-
url = URI(url
|
87
|
+
url = URI(url)
|
88
88
|
|
89
89
|
# session key
|
90
90
|
key = key_for(url)
|
@@ -127,7 +127,7 @@ module Spidr
|
|
127
127
|
#
|
128
128
|
def kill!(url)
|
129
129
|
# normalize the url
|
130
|
-
url = URI(url
|
130
|
+
url = URI(url)
|
131
131
|
|
132
132
|
# session key
|
133
133
|
key = key_for(url)
|
data/lib/spidr/spidr.rb
CHANGED
data/lib/spidr/version.rb
CHANGED
data/spec/agent_spec.rb
CHANGED
@@ -786,14 +786,12 @@ describe Agent do
|
|
786
786
|
|
787
787
|
[
|
788
788
|
"User-agent: *",
|
789
|
-
'Disallow: /',
|
789
|
+
'Disallow: /secret',
|
790
790
|
].join($/)
|
791
791
|
end
|
792
792
|
end
|
793
793
|
|
794
794
|
it "should not follow links Disallowed by robots.txt" do
|
795
|
-
pending "https://github.com/bblimke/webmock/issues/642"
|
796
|
-
|
797
795
|
expect(subject.history).to be == Set[
|
798
796
|
URI("http://#{host}/"),
|
799
797
|
URI("http://#{host}/pub")
|
metadata
CHANGED
@@ -1,43 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.3'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.3'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.0'
|
41
41
|
description: Spidr is a versatile Ruby web spidering library that can spider a site,
|
42
42
|
multiple domains, certain links or infinitely. Spidr is designed to be fast and
|
43
43
|
easy to use.
|
@@ -49,10 +49,10 @@ extra_rdoc_files:
|
|
49
49
|
- LICENSE.txt
|
50
50
|
- README.md
|
51
51
|
files:
|
52
|
-
- .gitignore
|
53
|
-
- .rspec
|
54
|
-
- .travis.yml
|
55
|
-
- .yardopts
|
52
|
+
- ".gitignore"
|
53
|
+
- ".rspec"
|
54
|
+
- ".travis.yml"
|
55
|
+
- ".yardopts"
|
56
56
|
- ChangeLog.md
|
57
57
|
- Gemfile
|
58
58
|
- LICENSE.txt
|
@@ -118,17 +118,16 @@ require_paths:
|
|
118
118
|
- lib
|
119
119
|
required_ruby_version: !ruby/object:Gem::Requirement
|
120
120
|
requirements:
|
121
|
-
- -
|
121
|
+
- - ">="
|
122
122
|
- !ruby/object:Gem::Version
|
123
123
|
version: 2.0.0
|
124
124
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
125
|
requirements:
|
126
|
-
- -
|
126
|
+
- - ">="
|
127
127
|
- !ruby/object:Gem::Version
|
128
128
|
version: '0'
|
129
129
|
requirements: []
|
130
|
-
|
131
|
-
rubygems_version: 2.0.14.1
|
130
|
+
rubygems_version: 3.0.3
|
132
131
|
signing_key:
|
133
132
|
specification_version: 4
|
134
133
|
summary: A versatile Ruby web spidering library
|