spidr 0.6.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +9 -7
- data/ChangeLog.md +7 -0
- data/Gemfile +3 -3
- data/README.md +1 -1
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +11 -23
- data/lib/spidr/agent/filters.rb +3 -3
- data/lib/spidr/agent/sanitizers.rb +1 -1
- data/lib/spidr/auth_store.rb +9 -5
- data/lib/spidr/page/html.rb +1 -1
- data/lib/spidr/session_cache.rb +3 -3
- data/lib/spidr/spidr.rb +1 -0
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +1 -3
- metadata +15 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e2202e0ce389cbbc6f88360d7e7b430328bc6da973b69929ebc54b1d92c104bb
|
4
|
+
data.tar.gz: 0777e972ef2cb1d540ee138a7703ee67f88b482260074c13e8c08a8da963aa77
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45e923ad3aa59812de4af67cc3a1739d7751749b3aa3205c5979e1f29c302abd43316cd74ad7c61befda2d31b0ff080872047e9713537d3bfe2509a8d555156a
|
7
|
+
data.tar.gz: deae3dbd7d9566723ca8760064c715de8795b1d4e4be1ae0866497d697a411af3e8bbe2a677028a5bf57bba8e681bd9d712cef3a2d0d123bc903af28f5e32a77
|
data/.travis.yml
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
---
|
2
|
+
before_install:
|
3
|
+
- gem update --system
|
4
|
+
- gem install bundler -v "~> 2.0"
|
2
5
|
language: ruby
|
6
|
+
sudo: false
|
7
|
+
cache:
|
8
|
+
- bundler
|
3
9
|
rvm:
|
4
|
-
- 2.
|
5
|
-
- 2.
|
6
|
-
- 2.2.4
|
7
|
-
- 2.3.1
|
10
|
+
- 2.5
|
11
|
+
- 2.6
|
8
12
|
- jruby
|
9
|
-
- rbx
|
10
13
|
matrix:
|
11
14
|
allow_failures:
|
12
15
|
- rvm: jruby
|
13
|
-
|
14
|
-
script: rake spec
|
16
|
+
script: bundle exec rake spec
|
data/ChangeLog.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
### 0.6.1 / 2019-10-24
|
2
|
+
|
3
|
+
* Check for opaque component of URIs before attempting to set the path
|
4
|
+
component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
|
5
|
+
opaque` exceptions.
|
6
|
+
* Fix `@robots` instance variable warning (@spk).
|
7
|
+
|
1
8
|
### 0.6.0 / 2016-08-04
|
2
9
|
|
3
10
|
* Added {Spidr::Proxy}.
|
data/Gemfile
CHANGED
@@ -13,9 +13,9 @@ group :development do
|
|
13
13
|
gem 'rubygems-tasks', '~> 0.2'
|
14
14
|
|
15
15
|
gem 'rspec', '~> 3.0'
|
16
|
-
gem 'webmock', '~>
|
16
|
+
gem 'webmock', '~> 3.0'
|
17
17
|
gem 'sinatra', '~> 1.0'
|
18
18
|
|
19
|
-
gem 'kramdown'
|
20
|
-
gem 'yard', '~> 0.
|
19
|
+
gem 'kramdown'
|
20
|
+
gem 'yard', '~> 0.9'
|
21
21
|
end
|
data/README.md
CHANGED
@@ -157,7 +157,7 @@ Find what kinds of web servers a host is using, by accessing the headers:
|
|
157
157
|
|
158
158
|
Pause the spider on a forbidden page:
|
159
159
|
|
160
|
-
|
160
|
+
Spidr.host('company.com') do |spider|
|
161
161
|
spider.every_forbidden_page do |page|
|
162
162
|
spider.pause!
|
163
163
|
end
|
data/gemspec.yml
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -268,7 +268,7 @@ module Spidr
|
|
268
268
|
# @see #initialize
|
269
269
|
#
|
270
270
|
def self.site(url,options={},&block)
|
271
|
-
url = URI(url
|
271
|
+
url = URI(url)
|
272
272
|
|
273
273
|
agent = new(options.merge(host: url.host),&block)
|
274
274
|
agent.start_at(url)
|
@@ -408,9 +408,7 @@ module Spidr
|
|
408
408
|
@history.clear
|
409
409
|
|
410
410
|
new_history.each do |url|
|
411
|
-
|
412
|
-
|
413
|
-
@history << url
|
411
|
+
@history << URI(url)
|
414
412
|
end
|
415
413
|
|
416
414
|
return @history
|
@@ -425,7 +423,7 @@ module Spidr
|
|
425
423
|
# The links which have been visited.
|
426
424
|
#
|
427
425
|
def visited_links
|
428
|
-
@history.map
|
426
|
+
@history.map(&:to_s)
|
429
427
|
end
|
430
428
|
|
431
429
|
#
|
@@ -435,7 +433,7 @@ module Spidr
|
|
435
433
|
# The hosts which have been visited.
|
436
434
|
#
|
437
435
|
def visited_hosts
|
438
|
-
visited_urls.map
|
436
|
+
visited_urls.map(&:host).uniq
|
439
437
|
end
|
440
438
|
|
441
439
|
#
|
@@ -448,9 +446,7 @@ module Spidr
|
|
448
446
|
# Specifies whether a URL was visited.
|
449
447
|
#
|
450
448
|
def visited?(url)
|
451
|
-
|
452
|
-
|
453
|
-
return @history.include?(url)
|
449
|
+
@history.include?(URI(url))
|
454
450
|
end
|
455
451
|
|
456
452
|
#
|
@@ -469,9 +465,7 @@ module Spidr
|
|
469
465
|
@failures.clear
|
470
466
|
|
471
467
|
new_failures.each do |url|
|
472
|
-
|
473
|
-
|
474
|
-
@failures << url
|
468
|
+
@failures << URI(url)
|
475
469
|
end
|
476
470
|
|
477
471
|
return @failures
|
@@ -487,9 +481,7 @@ module Spidr
|
|
487
481
|
# Specifies whether the given URL was unable to be visited.
|
488
482
|
#
|
489
483
|
def failed?(url)
|
490
|
-
|
491
|
-
|
492
|
-
return @failures.include?(url)
|
484
|
+
@failures.include?(URI(url))
|
493
485
|
end
|
494
486
|
|
495
487
|
alias pending_urls queue
|
@@ -510,9 +502,7 @@ module Spidr
|
|
510
502
|
@queue.clear
|
511
503
|
|
512
504
|
new_queue.each do |url|
|
513
|
-
|
514
|
-
|
515
|
-
@queue << url
|
505
|
+
@queue << URI(url)
|
516
506
|
end
|
517
507
|
|
518
508
|
return @queue
|
@@ -594,7 +584,7 @@ module Spidr
|
|
594
584
|
# The page for the response, or `nil` if the request failed.
|
595
585
|
#
|
596
586
|
def get_page(url)
|
597
|
-
url = URI(url
|
587
|
+
url = URI(url)
|
598
588
|
|
599
589
|
prepare_request(url) do |session,path,headers|
|
600
590
|
new_page = Page.new(url,session.get(path,headers))
|
@@ -629,7 +619,7 @@ module Spidr
|
|
629
619
|
# @since 0.2.2
|
630
620
|
#
|
631
621
|
def post_page(url,post_data='')
|
632
|
-
url = URI(url
|
622
|
+
url = URI(url)
|
633
623
|
|
634
624
|
prepare_request(url) do |session,path,headers|
|
635
625
|
new_page = Page.new(url,session.post(path,post_data,headers))
|
@@ -725,7 +715,7 @@ module Spidr
|
|
725
715
|
|
726
716
|
unless @host_headers.empty?
|
727
717
|
@host_headers.each do |name,header|
|
728
|
-
if host.match(name)
|
718
|
+
if url.host.match(name)
|
729
719
|
headers['Host'] = header
|
730
720
|
break
|
731
721
|
end
|
@@ -769,8 +759,6 @@ module Spidr
|
|
769
759
|
# @since 0.2.2
|
770
760
|
#
|
771
761
|
def prepare_request(url,&block)
|
772
|
-
host = url.host
|
773
|
-
port = url.port
|
774
762
|
path = unless url.path.empty?
|
775
763
|
url.path
|
776
764
|
else
|
data/lib/spidr/agent/filters.rb
CHANGED
@@ -16,7 +16,7 @@ module Spidr
|
|
16
16
|
# agent.schemes = ['http']
|
17
17
|
#
|
18
18
|
def schemes=(new_schemes)
|
19
|
-
@schemes = new_schemes.map
|
19
|
+
@schemes = new_schemes.map(&:to_s)
|
20
20
|
end
|
21
21
|
|
22
22
|
#
|
@@ -452,9 +452,9 @@ module Spidr
|
|
452
452
|
#
|
453
453
|
def visit_scheme?(scheme)
|
454
454
|
if scheme
|
455
|
-
|
455
|
+
@schemes.include?(scheme)
|
456
456
|
else
|
457
|
-
|
457
|
+
true
|
458
458
|
end
|
459
459
|
end
|
460
460
|
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -34,7 +34,7 @@ module Spidr
|
|
34
34
|
#
|
35
35
|
def [](url)
|
36
36
|
# normalize the url
|
37
|
-
url = URI(url
|
37
|
+
url = URI(url)
|
38
38
|
|
39
39
|
key = [url.scheme, url.host, url.port]
|
40
40
|
paths = @credentials[key]
|
@@ -42,7 +42,7 @@ module Spidr
|
|
42
42
|
return nil unless paths
|
43
43
|
|
44
44
|
# longest path first
|
45
|
-
ordered_paths = paths.keys.sort_by { |
|
45
|
+
ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
|
46
46
|
|
47
47
|
# directories of the path
|
48
48
|
path_dirs = URI.expand_path(url.path).split('/')
|
@@ -70,7 +70,7 @@ module Spidr
|
|
70
70
|
#
|
71
71
|
def []=(url,auth)
|
72
72
|
# normalize the url
|
73
|
-
url = URI(url
|
73
|
+
url = URI(url)
|
74
74
|
|
75
75
|
# normalize the URL path
|
76
76
|
path = URI.expand_path(url.path)
|
@@ -118,7 +118,7 @@ module Spidr
|
|
118
118
|
#
|
119
119
|
def for_url(url)
|
120
120
|
if (auth = self[url])
|
121
|
-
|
121
|
+
Base64.encode64("#{auth.username}:#{auth.password}")
|
122
122
|
end
|
123
123
|
end
|
124
124
|
|
@@ -144,7 +144,11 @@ module Spidr
|
|
144
144
|
# @since 0.2.2
|
145
145
|
#
|
146
146
|
def size
|
147
|
-
|
147
|
+
total = 0
|
148
|
+
|
149
|
+
@credentials.each_value { |paths| total += paths.length }
|
150
|
+
|
151
|
+
return total
|
148
152
|
end
|
149
153
|
|
150
154
|
#
|
data/lib/spidr/page/html.rb
CHANGED
@@ -271,7 +271,7 @@ module Spidr
|
|
271
271
|
return
|
272
272
|
end
|
273
273
|
|
274
|
-
if (path = new_url.path)
|
274
|
+
if (!new_url.opaque) && (path = new_url.path)
|
275
275
|
# ensure that paths begin with a leading '/' for URI::FTP
|
276
276
|
if (new_url.scheme == 'ftp' && !path.start_with?('/'))
|
277
277
|
path.insert(0,'/')
|
data/lib/spidr/session_cache.rb
CHANGED
@@ -65,7 +65,7 @@ module Spidr
|
|
65
65
|
#
|
66
66
|
def active?(url)
|
67
67
|
# normalize the url
|
68
|
-
url = URI(url
|
68
|
+
url = URI(url)
|
69
69
|
|
70
70
|
# session key
|
71
71
|
key = key_for(url)
|
@@ -84,7 +84,7 @@ module Spidr
|
|
84
84
|
#
|
85
85
|
def [](url)
|
86
86
|
# normalize the url
|
87
|
-
url = URI(url
|
87
|
+
url = URI(url)
|
88
88
|
|
89
89
|
# session key
|
90
90
|
key = key_for(url)
|
@@ -127,7 +127,7 @@ module Spidr
|
|
127
127
|
#
|
128
128
|
def kill!(url)
|
129
129
|
# normalize the url
|
130
|
-
url = URI(url
|
130
|
+
url = URI(url)
|
131
131
|
|
132
132
|
# session key
|
133
133
|
key = key_for(url)
|
data/lib/spidr/spidr.rb
CHANGED
data/lib/spidr/version.rb
CHANGED
data/spec/agent_spec.rb
CHANGED
@@ -786,14 +786,12 @@ describe Agent do
|
|
786
786
|
|
787
787
|
[
|
788
788
|
"User-agent: *",
|
789
|
-
'Disallow: /',
|
789
|
+
'Disallow: /secret',
|
790
790
|
].join($/)
|
791
791
|
end
|
792
792
|
end
|
793
793
|
|
794
794
|
it "should not follow links Disallowed by robots.txt" do
|
795
|
-
pending "https://github.com/bblimke/webmock/issues/642"
|
796
|
-
|
797
795
|
expect(subject.history).to be == Set[
|
798
796
|
URI("http://#{host}/"),
|
799
797
|
URI("http://#{host}/pub")
|
metadata
CHANGED
@@ -1,43 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.3'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.3'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.0'
|
41
41
|
description: Spidr is a versatile Ruby web spidering library that can spider a site,
|
42
42
|
multiple domains, certain links or infinitely. Spidr is designed to be fast and
|
43
43
|
easy to use.
|
@@ -49,10 +49,10 @@ extra_rdoc_files:
|
|
49
49
|
- LICENSE.txt
|
50
50
|
- README.md
|
51
51
|
files:
|
52
|
-
- .gitignore
|
53
|
-
- .rspec
|
54
|
-
- .travis.yml
|
55
|
-
- .yardopts
|
52
|
+
- ".gitignore"
|
53
|
+
- ".rspec"
|
54
|
+
- ".travis.yml"
|
55
|
+
- ".yardopts"
|
56
56
|
- ChangeLog.md
|
57
57
|
- Gemfile
|
58
58
|
- LICENSE.txt
|
@@ -118,17 +118,16 @@ require_paths:
|
|
118
118
|
- lib
|
119
119
|
required_ruby_version: !ruby/object:Gem::Requirement
|
120
120
|
requirements:
|
121
|
-
- -
|
121
|
+
- - ">="
|
122
122
|
- !ruby/object:Gem::Version
|
123
123
|
version: 2.0.0
|
124
124
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
125
|
requirements:
|
126
|
-
- -
|
126
|
+
- - ">="
|
127
127
|
- !ruby/object:Gem::Version
|
128
128
|
version: '0'
|
129
129
|
requirements: []
|
130
|
-
|
131
|
-
rubygems_version: 2.0.14.1
|
130
|
+
rubygems_version: 3.0.3
|
132
131
|
signing_key:
|
133
132
|
specification_version: 4
|
134
133
|
summary: A versatile Ruby web spidering library
|