spidr 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
4
- data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
2
+ SHA256:
3
+ metadata.gz: e2202e0ce389cbbc6f88360d7e7b430328bc6da973b69929ebc54b1d92c104bb
4
+ data.tar.gz: 0777e972ef2cb1d540ee138a7703ee67f88b482260074c13e8c08a8da963aa77
5
5
  SHA512:
6
- metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
7
- data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
6
+ metadata.gz: 45e923ad3aa59812de4af67cc3a1739d7751749b3aa3205c5979e1f29c302abd43316cd74ad7c61befda2d31b0ff080872047e9713537d3bfe2509a8d555156a
7
+ data.tar.gz: deae3dbd7d9566723ca8760064c715de8795b1d4e4be1ae0866497d697a411af3e8bbe2a677028a5bf57bba8e681bd9d712cef3a2d0d123bc903af28f5e32a77
@@ -1,14 +1,16 @@
1
1
  ---
2
+ before_install:
3
+ - gem update --system
4
+ - gem install bundler -v "~> 2.0"
2
5
  language: ruby
6
+ sudo: false
7
+ cache:
8
+ - bundler
3
9
  rvm:
4
- - 2.0.0
5
- - 2.1.9
6
- - 2.2.4
7
- - 2.3.1
10
+ - 2.5
11
+ - 2.6
8
12
  - jruby
9
- - rbx
10
13
  matrix:
11
14
  allow_failures:
12
15
  - rvm: jruby
13
- - rvm: rbx
14
- script: rake spec
16
+ script: bundle exec rake spec
@@ -1,3 +1,10 @@
1
+ ### 0.6.1 / 2019-10-24
2
+
3
+ * Check for opaque component of URIs before attempting to set the path
4
+ component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
5
+ opaque` exceptions.
6
+ * Fix `@robots` instance variable warning (@spk).
7
+
1
8
  ### 0.6.0 / 2016-08-04
2
9
 
3
10
  * Added {Spidr::Proxy}.
data/Gemfile CHANGED
@@ -13,9 +13,9 @@ group :development do
13
13
  gem 'rubygems-tasks', '~> 0.2'
14
14
 
15
15
  gem 'rspec', '~> 3.0'
16
- gem 'webmock', '~> 2.0'
16
+ gem 'webmock', '~> 3.0'
17
17
  gem 'sinatra', '~> 1.0'
18
18
 
19
- gem 'kramdown', '~> 0.12'
20
- gem 'yard', '~> 0.8'
19
+ gem 'kramdown'
20
+ gem 'yard', '~> 0.9'
21
21
  end
data/README.md CHANGED
@@ -157,7 +157,7 @@ Find what kinds of web servers a host is using, by accessing the headers:
157
157
 
158
158
  Pause the spider on a forbidden page:
159
159
 
160
- spider = Spidr.host('company.com') do |spider|
160
+ Spidr.host('company.com') do |spider|
161
161
  spider.every_forbidden_page do |page|
162
162
  spider.pause!
163
163
  end
@@ -17,4 +17,4 @@ dependencies:
17
17
  nokogiri: ~> 1.3
18
18
 
19
19
  development_dependencies:
20
- bundler: ~> 1.0
20
+ bundler: ~> 2.0
@@ -268,7 +268,7 @@ module Spidr
268
268
  # @see #initialize
269
269
  #
270
270
  def self.site(url,options={},&block)
271
- url = URI(url.to_s) unless url.kind_of?(URI)
271
+ url = URI(url)
272
272
 
273
273
  agent = new(options.merge(host: url.host),&block)
274
274
  agent.start_at(url)
@@ -408,9 +408,7 @@ module Spidr
408
408
  @history.clear
409
409
 
410
410
  new_history.each do |url|
411
- url = URI(url.to_s) unless url.kind_of?(URI)
412
-
413
- @history << url
411
+ @history << URI(url)
414
412
  end
415
413
 
416
414
  return @history
@@ -425,7 +423,7 @@ module Spidr
425
423
  # The links which have been visited.
426
424
  #
427
425
  def visited_links
428
- @history.map { |url| url.to_s }
426
+ @history.map(&:to_s)
429
427
  end
430
428
 
431
429
  #
@@ -435,7 +433,7 @@ module Spidr
435
433
  # The hosts which have been visited.
436
434
  #
437
435
  def visited_hosts
438
- visited_urls.map { |uri| uri.host }.uniq
436
+ visited_urls.map(&:host).uniq
439
437
  end
440
438
 
441
439
  #
@@ -448,9 +446,7 @@ module Spidr
448
446
  # Specifies whether a URL was visited.
449
447
  #
450
448
  def visited?(url)
451
- url = URI(url.to_s) unless url.kind_of?(URI)
452
-
453
- return @history.include?(url)
449
+ @history.include?(URI(url))
454
450
  end
455
451
 
456
452
  #
@@ -469,9 +465,7 @@ module Spidr
469
465
  @failures.clear
470
466
 
471
467
  new_failures.each do |url|
472
- url = URI(url.to_s) unless url.kind_of?(URI)
473
-
474
- @failures << url
468
+ @failures << URI(url)
475
469
  end
476
470
 
477
471
  return @failures
@@ -487,9 +481,7 @@ module Spidr
487
481
  # Specifies whether the given URL was unable to be visited.
488
482
  #
489
483
  def failed?(url)
490
- url = URI(url.to_s) unless url.kind_of?(URI)
491
-
492
- return @failures.include?(url)
484
+ @failures.include?(URI(url))
493
485
  end
494
486
 
495
487
  alias pending_urls queue
@@ -510,9 +502,7 @@ module Spidr
510
502
  @queue.clear
511
503
 
512
504
  new_queue.each do |url|
513
- url = URI(url.to_s) unless url.kind_of?(URI)
514
-
515
- @queue << url
505
+ @queue << URI(url)
516
506
  end
517
507
 
518
508
  return @queue
@@ -594,7 +584,7 @@ module Spidr
594
584
  # The page for the response, or `nil` if the request failed.
595
585
  #
596
586
  def get_page(url)
597
- url = URI(url.to_s)
587
+ url = URI(url)
598
588
 
599
589
  prepare_request(url) do |session,path,headers|
600
590
  new_page = Page.new(url,session.get(path,headers))
@@ -629,7 +619,7 @@ module Spidr
629
619
  # @since 0.2.2
630
620
  #
631
621
  def post_page(url,post_data='')
632
- url = URI(url.to_s) unless url.kind_of?(URI)
622
+ url = URI(url)
633
623
 
634
624
  prepare_request(url) do |session,path,headers|
635
625
  new_page = Page.new(url,session.post(path,post_data,headers))
@@ -725,7 +715,7 @@ module Spidr
725
715
 
726
716
  unless @host_headers.empty?
727
717
  @host_headers.each do |name,header|
728
- if host.match(name)
718
+ if url.host.match(name)
729
719
  headers['Host'] = header
730
720
  break
731
721
  end
@@ -769,8 +759,6 @@ module Spidr
769
759
  # @since 0.2.2
770
760
  #
771
761
  def prepare_request(url,&block)
772
- host = url.host
773
- port = url.port
774
762
  path = unless url.path.empty?
775
763
  url.path
776
764
  else
@@ -16,7 +16,7 @@ module Spidr
16
16
  # agent.schemes = ['http']
17
17
  #
18
18
  def schemes=(new_schemes)
19
- @schemes = new_schemes.map { |scheme| scheme.to_s }
19
+ @schemes = new_schemes.map(&:to_s)
20
20
  end
21
21
 
22
22
  #
@@ -452,9 +452,9 @@ module Spidr
452
452
  #
453
453
  def visit_scheme?(scheme)
454
454
  if scheme
455
- return @schemes.include?(scheme)
455
+ @schemes.include?(scheme)
456
456
  else
457
- return true
457
+ true
458
458
  end
459
459
  end
460
460
 
@@ -21,7 +21,7 @@ module Spidr
21
21
  # @since 0.2.2
22
22
  #
23
23
  def sanitize_url(url)
24
- url = URI(url.to_s) unless url.kind_of?(URI)
24
+ url = URI(url)
25
25
 
26
26
  url.fragment = nil if @strip_fragments
27
27
  url.query = nil if @strip_query
@@ -34,7 +34,7 @@ module Spidr
34
34
  #
35
35
  def [](url)
36
36
  # normalize the url
37
- url = URI(url.to_s) unless url.kind_of?(URI)
37
+ url = URI(url)
38
38
 
39
39
  key = [url.scheme, url.host, url.port]
40
40
  paths = @credentials[key]
@@ -42,7 +42,7 @@ module Spidr
42
42
  return nil unless paths
43
43
 
44
44
  # longest path first
45
- ordered_paths = paths.keys.sort_by { |key| key.length }.reverse
45
+ ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
46
46
 
47
47
  # directories of the path
48
48
  path_dirs = URI.expand_path(url.path).split('/')
@@ -70,7 +70,7 @@ module Spidr
70
70
  #
71
71
  def []=(url,auth)
72
72
  # normalize the url
73
- url = URI(url.to_s) unless url.kind_of?(URI)
73
+ url = URI(url)
74
74
 
75
75
  # normalize the URL path
76
76
  path = URI.expand_path(url.path)
@@ -118,7 +118,7 @@ module Spidr
118
118
  #
119
119
  def for_url(url)
120
120
  if (auth = self[url])
121
- return Base64.encode64("#{auth.username}:#{auth.password}")
121
+ Base64.encode64("#{auth.username}:#{auth.password}")
122
122
  end
123
123
  end
124
124
 
@@ -144,7 +144,11 @@ module Spidr
144
144
  # @since 0.2.2
145
145
  #
146
146
  def size
147
- @credentials.inject(0) { |res, arr| res + arr[1].length }
147
+ total = 0
148
+
149
+ @credentials.each_value { |paths| total += paths.length }
150
+
151
+ return total
148
152
  end
149
153
 
150
154
  #
@@ -271,7 +271,7 @@ module Spidr
271
271
  return
272
272
  end
273
273
 
274
- if (path = new_url.path)
274
+ if (!new_url.opaque) && (path = new_url.path)
275
275
  # ensure that paths begin with a leading '/' for URI::FTP
276
276
  if (new_url.scheme == 'ftp' && !path.start_with?('/'))
277
277
  path.insert(0,'/')
@@ -65,7 +65,7 @@ module Spidr
65
65
  #
66
66
  def active?(url)
67
67
  # normalize the url
68
- url = URI(url.to_s) unless url.kind_of?(URI)
68
+ url = URI(url)
69
69
 
70
70
  # session key
71
71
  key = key_for(url)
@@ -84,7 +84,7 @@ module Spidr
84
84
  #
85
85
  def [](url)
86
86
  # normalize the url
87
- url = URI(url.to_s) unless url.kind_of?(URI)
87
+ url = URI(url)
88
88
 
89
89
  # session key
90
90
  key = key_for(url)
@@ -127,7 +127,7 @@ module Spidr
127
127
  #
128
128
  def kill!(url)
129
129
  # normalize the url
130
- url = URI(url.to_s) unless url.kind_of?(URI)
130
+ url = URI(url)
131
131
 
132
132
  # session key
133
133
  key = key_for(url)
@@ -16,6 +16,7 @@ module Spidr
16
16
  # @since 0.5.0
17
17
  #
18
18
  def self.robots?
19
+ @robots ||= false
19
20
  @robots
20
21
  end
21
22
 
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.6.0'
3
+ VERSION = '0.6.1'
4
4
  end
@@ -786,14 +786,12 @@ describe Agent do
786
786
 
787
787
  [
788
788
  "User-agent: *",
789
- 'Disallow: /',
789
+ 'Disallow: /secret',
790
790
  ].join($/)
791
791
  end
792
792
  end
793
793
 
794
794
  it "should not follow links Disallowed by robots.txt" do
795
- pending "https://github.com/bblimke/webmock/issues/642"
796
-
797
795
  expect(subject.history).to be == Set[
798
796
  URI("http://#{host}/"),
799
797
  URI("http://#{host}/pub")
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-08-05 00:00:00.000000000 Z
11
+ date: 2019-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '1.0'
33
+ version: '2.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '1.0'
40
+ version: '2.0'
41
41
  description: Spidr is a versatile Ruby web spidering library that can spider a site,
42
42
  multiple domains, certain links or infinitely. Spidr is designed to be fast and
43
43
  easy to use.
@@ -49,10 +49,10 @@ extra_rdoc_files:
49
49
  - LICENSE.txt
50
50
  - README.md
51
51
  files:
52
- - .gitignore
53
- - .rspec
54
- - .travis.yml
55
- - .yardopts
52
+ - ".gitignore"
53
+ - ".rspec"
54
+ - ".travis.yml"
55
+ - ".yardopts"
56
56
  - ChangeLog.md
57
57
  - Gemfile
58
58
  - LICENSE.txt
@@ -118,17 +118,16 @@ require_paths:
118
118
  - lib
119
119
  required_ruby_version: !ruby/object:Gem::Requirement
120
120
  requirements:
121
- - - '>='
121
+ - - ">="
122
122
  - !ruby/object:Gem::Version
123
123
  version: 2.0.0
124
124
  required_rubygems_version: !ruby/object:Gem::Requirement
125
125
  requirements:
126
- - - '>='
126
+ - - ">="
127
127
  - !ruby/object:Gem::Version
128
128
  version: '0'
129
129
  requirements: []
130
- rubyforge_project:
131
- rubygems_version: 2.0.14.1
130
+ rubygems_version: 3.0.3
132
131
  signing_key:
133
132
  specification_version: 4
134
133
  summary: A versatile Ruby web spidering library