spidr 0.6.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
4
- data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
2
+ SHA256:
3
+ metadata.gz: e2202e0ce389cbbc6f88360d7e7b430328bc6da973b69929ebc54b1d92c104bb
4
+ data.tar.gz: 0777e972ef2cb1d540ee138a7703ee67f88b482260074c13e8c08a8da963aa77
5
5
  SHA512:
6
- metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
7
- data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
6
+ metadata.gz: 45e923ad3aa59812de4af67cc3a1739d7751749b3aa3205c5979e1f29c302abd43316cd74ad7c61befda2d31b0ff080872047e9713537d3bfe2509a8d555156a
7
+ data.tar.gz: deae3dbd7d9566723ca8760064c715de8795b1d4e4be1ae0866497d697a411af3e8bbe2a677028a5bf57bba8e681bd9d712cef3a2d0d123bc903af28f5e32a77
@@ -1,14 +1,16 @@
1
1
  ---
2
+ before_install:
3
+ - gem update --system
4
+ - gem install bundler -v "~> 2.0"
2
5
  language: ruby
6
+ sudo: false
7
+ cache:
8
+ - bundler
3
9
  rvm:
4
- - 2.0.0
5
- - 2.1.9
6
- - 2.2.4
7
- - 2.3.1
10
+ - 2.5
11
+ - 2.6
8
12
  - jruby
9
- - rbx
10
13
  matrix:
11
14
  allow_failures:
12
15
  - rvm: jruby
13
- - rvm: rbx
14
- script: rake spec
16
+ script: bundle exec rake spec
@@ -1,3 +1,10 @@
1
+ ### 0.6.1 / 2019-10-24
2
+
3
+ * Check for opaque component of URIs before attempting to set the path
4
+ component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
5
+ opaque` exceptions.
6
+ * Fix `@robots` instance variable warning (@spk).
7
+
1
8
  ### 0.6.0 / 2016-08-04
2
9
 
3
10
  * Added {Spidr::Proxy}.
data/Gemfile CHANGED
@@ -13,9 +13,9 @@ group :development do
13
13
  gem 'rubygems-tasks', '~> 0.2'
14
14
 
15
15
  gem 'rspec', '~> 3.0'
16
- gem 'webmock', '~> 2.0'
16
+ gem 'webmock', '~> 3.0'
17
17
  gem 'sinatra', '~> 1.0'
18
18
 
19
- gem 'kramdown', '~> 0.12'
20
- gem 'yard', '~> 0.8'
19
+ gem 'kramdown'
20
+ gem 'yard', '~> 0.9'
21
21
  end
data/README.md CHANGED
@@ -157,7 +157,7 @@ Find what kinds of web servers a host is using, by accessing the headers:
157
157
 
158
158
  Pause the spider on a forbidden page:
159
159
 
160
- spider = Spidr.host('company.com') do |spider|
160
+ Spidr.host('company.com') do |spider|
161
161
  spider.every_forbidden_page do |page|
162
162
  spider.pause!
163
163
  end
@@ -17,4 +17,4 @@ dependencies:
17
17
  nokogiri: ~> 1.3
18
18
 
19
19
  development_dependencies:
20
- bundler: ~> 1.0
20
+ bundler: ~> 2.0
@@ -268,7 +268,7 @@ module Spidr
268
268
  # @see #initialize
269
269
  #
270
270
  def self.site(url,options={},&block)
271
- url = URI(url.to_s) unless url.kind_of?(URI)
271
+ url = URI(url)
272
272
 
273
273
  agent = new(options.merge(host: url.host),&block)
274
274
  agent.start_at(url)
@@ -408,9 +408,7 @@ module Spidr
408
408
  @history.clear
409
409
 
410
410
  new_history.each do |url|
411
- url = URI(url.to_s) unless url.kind_of?(URI)
412
-
413
- @history << url
411
+ @history << URI(url)
414
412
  end
415
413
 
416
414
  return @history
@@ -425,7 +423,7 @@ module Spidr
425
423
  # The links which have been visited.
426
424
  #
427
425
  def visited_links
428
- @history.map { |url| url.to_s }
426
+ @history.map(&:to_s)
429
427
  end
430
428
 
431
429
  #
@@ -435,7 +433,7 @@ module Spidr
435
433
  # The hosts which have been visited.
436
434
  #
437
435
  def visited_hosts
438
- visited_urls.map { |uri| uri.host }.uniq
436
+ visited_urls.map(&:host).uniq
439
437
  end
440
438
 
441
439
  #
@@ -448,9 +446,7 @@ module Spidr
448
446
  # Specifies whether a URL was visited.
449
447
  #
450
448
  def visited?(url)
451
- url = URI(url.to_s) unless url.kind_of?(URI)
452
-
453
- return @history.include?(url)
449
+ @history.include?(URI(url))
454
450
  end
455
451
 
456
452
  #
@@ -469,9 +465,7 @@ module Spidr
469
465
  @failures.clear
470
466
 
471
467
  new_failures.each do |url|
472
- url = URI(url.to_s) unless url.kind_of?(URI)
473
-
474
- @failures << url
468
+ @failures << URI(url)
475
469
  end
476
470
 
477
471
  return @failures
@@ -487,9 +481,7 @@ module Spidr
487
481
  # Specifies whether the given URL was unable to be visited.
488
482
  #
489
483
  def failed?(url)
490
- url = URI(url.to_s) unless url.kind_of?(URI)
491
-
492
- return @failures.include?(url)
484
+ @failures.include?(URI(url))
493
485
  end
494
486
 
495
487
  alias pending_urls queue
@@ -510,9 +502,7 @@ module Spidr
510
502
  @queue.clear
511
503
 
512
504
  new_queue.each do |url|
513
- url = URI(url.to_s) unless url.kind_of?(URI)
514
-
515
- @queue << url
505
+ @queue << URI(url)
516
506
  end
517
507
 
518
508
  return @queue
@@ -594,7 +584,7 @@ module Spidr
594
584
  # The page for the response, or `nil` if the request failed.
595
585
  #
596
586
  def get_page(url)
597
- url = URI(url.to_s)
587
+ url = URI(url)
598
588
 
599
589
  prepare_request(url) do |session,path,headers|
600
590
  new_page = Page.new(url,session.get(path,headers))
@@ -629,7 +619,7 @@ module Spidr
629
619
  # @since 0.2.2
630
620
  #
631
621
  def post_page(url,post_data='')
632
- url = URI(url.to_s) unless url.kind_of?(URI)
622
+ url = URI(url)
633
623
 
634
624
  prepare_request(url) do |session,path,headers|
635
625
  new_page = Page.new(url,session.post(path,post_data,headers))
@@ -725,7 +715,7 @@ module Spidr
725
715
 
726
716
  unless @host_headers.empty?
727
717
  @host_headers.each do |name,header|
728
- if host.match(name)
718
+ if url.host.match(name)
729
719
  headers['Host'] = header
730
720
  break
731
721
  end
@@ -769,8 +759,6 @@ module Spidr
769
759
  # @since 0.2.2
770
760
  #
771
761
  def prepare_request(url,&block)
772
- host = url.host
773
- port = url.port
774
762
  path = unless url.path.empty?
775
763
  url.path
776
764
  else
@@ -16,7 +16,7 @@ module Spidr
16
16
  # agent.schemes = ['http']
17
17
  #
18
18
  def schemes=(new_schemes)
19
- @schemes = new_schemes.map { |scheme| scheme.to_s }
19
+ @schemes = new_schemes.map(&:to_s)
20
20
  end
21
21
 
22
22
  #
@@ -452,9 +452,9 @@ module Spidr
452
452
  #
453
453
  def visit_scheme?(scheme)
454
454
  if scheme
455
- return @schemes.include?(scheme)
455
+ @schemes.include?(scheme)
456
456
  else
457
- return true
457
+ true
458
458
  end
459
459
  end
460
460
 
@@ -21,7 +21,7 @@ module Spidr
21
21
  # @since 0.2.2
22
22
  #
23
23
  def sanitize_url(url)
24
- url = URI(url.to_s) unless url.kind_of?(URI)
24
+ url = URI(url)
25
25
 
26
26
  url.fragment = nil if @strip_fragments
27
27
  url.query = nil if @strip_query
@@ -34,7 +34,7 @@ module Spidr
34
34
  #
35
35
  def [](url)
36
36
  # normalize the url
37
- url = URI(url.to_s) unless url.kind_of?(URI)
37
+ url = URI(url)
38
38
 
39
39
  key = [url.scheme, url.host, url.port]
40
40
  paths = @credentials[key]
@@ -42,7 +42,7 @@ module Spidr
42
42
  return nil unless paths
43
43
 
44
44
  # longest path first
45
- ordered_paths = paths.keys.sort_by { |key| key.length }.reverse
45
+ ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
46
46
 
47
47
  # directories of the path
48
48
  path_dirs = URI.expand_path(url.path).split('/')
@@ -70,7 +70,7 @@ module Spidr
70
70
  #
71
71
  def []=(url,auth)
72
72
  # normalize the url
73
- url = URI(url.to_s) unless url.kind_of?(URI)
73
+ url = URI(url)
74
74
 
75
75
  # normalize the URL path
76
76
  path = URI.expand_path(url.path)
@@ -118,7 +118,7 @@ module Spidr
118
118
  #
119
119
  def for_url(url)
120
120
  if (auth = self[url])
121
- return Base64.encode64("#{auth.username}:#{auth.password}")
121
+ Base64.encode64("#{auth.username}:#{auth.password}")
122
122
  end
123
123
  end
124
124
 
@@ -144,7 +144,11 @@ module Spidr
144
144
  # @since 0.2.2
145
145
  #
146
146
  def size
147
- @credentials.inject(0) { |res, arr| res + arr[1].length }
147
+ total = 0
148
+
149
+ @credentials.each_value { |paths| total += paths.length }
150
+
151
+ return total
148
152
  end
149
153
 
150
154
  #
@@ -271,7 +271,7 @@ module Spidr
271
271
  return
272
272
  end
273
273
 
274
- if (path = new_url.path)
274
+ if (!new_url.opaque) && (path = new_url.path)
275
275
  # ensure that paths begin with a leading '/' for URI::FTP
276
276
  if (new_url.scheme == 'ftp' && !path.start_with?('/'))
277
277
  path.insert(0,'/')
@@ -65,7 +65,7 @@ module Spidr
65
65
  #
66
66
  def active?(url)
67
67
  # normalize the url
68
- url = URI(url.to_s) unless url.kind_of?(URI)
68
+ url = URI(url)
69
69
 
70
70
  # session key
71
71
  key = key_for(url)
@@ -84,7 +84,7 @@ module Spidr
84
84
  #
85
85
  def [](url)
86
86
  # normalize the url
87
- url = URI(url.to_s) unless url.kind_of?(URI)
87
+ url = URI(url)
88
88
 
89
89
  # session key
90
90
  key = key_for(url)
@@ -127,7 +127,7 @@ module Spidr
127
127
  #
128
128
  def kill!(url)
129
129
  # normalize the url
130
- url = URI(url.to_s) unless url.kind_of?(URI)
130
+ url = URI(url)
131
131
 
132
132
  # session key
133
133
  key = key_for(url)
@@ -16,6 +16,7 @@ module Spidr
16
16
  # @since 0.5.0
17
17
  #
18
18
  def self.robots?
19
+ @robots ||= false
19
20
  @robots
20
21
  end
21
22
 
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.6.0'
3
+ VERSION = '0.6.1'
4
4
  end
@@ -786,14 +786,12 @@ describe Agent do
786
786
 
787
787
  [
788
788
  "User-agent: *",
789
- 'Disallow: /',
789
+ 'Disallow: /secret',
790
790
  ].join($/)
791
791
  end
792
792
  end
793
793
 
794
794
  it "should not follow links Disallowed by robots.txt" do
795
- pending "https://github.com/bblimke/webmock/issues/642"
796
-
797
795
  expect(subject.history).to be == Set[
798
796
  URI("http://#{host}/"),
799
797
  URI("http://#{host}/pub")
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-08-05 00:00:00.000000000 Z
11
+ date: 2019-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '1.0'
33
+ version: '2.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '1.0'
40
+ version: '2.0'
41
41
  description: Spidr is a versatile Ruby web spidering library that can spider a site,
42
42
  multiple domains, certain links or infinitely. Spidr is designed to be fast and
43
43
  easy to use.
@@ -49,10 +49,10 @@ extra_rdoc_files:
49
49
  - LICENSE.txt
50
50
  - README.md
51
51
  files:
52
- - .gitignore
53
- - .rspec
54
- - .travis.yml
55
- - .yardopts
52
+ - ".gitignore"
53
+ - ".rspec"
54
+ - ".travis.yml"
55
+ - ".yardopts"
56
56
  - ChangeLog.md
57
57
  - Gemfile
58
58
  - LICENSE.txt
@@ -118,17 +118,16 @@ require_paths:
118
118
  - lib
119
119
  required_ruby_version: !ruby/object:Gem::Requirement
120
120
  requirements:
121
- - - '>='
121
+ - - ">="
122
122
  - !ruby/object:Gem::Version
123
123
  version: 2.0.0
124
124
  required_rubygems_version: !ruby/object:Gem::Requirement
125
125
  requirements:
126
- - - '>='
126
+ - - ">="
127
127
  - !ruby/object:Gem::Version
128
128
  version: '0'
129
129
  requirements: []
130
- rubyforge_project:
131
- rubygems_version: 2.0.14.1
130
+ rubygems_version: 3.0.3
132
131
  signing_key:
133
132
  specification_version: 4
134
133
  summary: A versatile Ruby web spidering library