spidr 0.6.1 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.editorconfig +11 -0
  3. data/.github/workflows/ruby.yml +26 -0
  4. data/.gitignore +4 -5
  5. data/ChangeLog.md +19 -1
  6. data/Gemfile +7 -4
  7. data/LICENSE.txt +1 -1
  8. data/README.md +136 -79
  9. data/Rakefile +1 -0
  10. data/gemspec.yml +7 -0
  11. data/lib/spidr/agent/actions.rb +3 -1
  12. data/lib/spidr/agent/events.rb +3 -1
  13. data/lib/spidr/agent/filters.rb +57 -56
  14. data/lib/spidr/agent/robots.rb +2 -0
  15. data/lib/spidr/agent/sanitizers.rb +7 -8
  16. data/lib/spidr/agent.rb +232 -108
  17. data/lib/spidr/auth_credential.rb +2 -0
  18. data/lib/spidr/auth_store.rb +9 -7
  19. data/lib/spidr/cookie_jar.rb +7 -5
  20. data/lib/spidr/extensions/uri.rb +3 -1
  21. data/lib/spidr/extensions.rb +3 -1
  22. data/lib/spidr/page/content_types.rb +53 -0
  23. data/lib/spidr/page/cookies.rb +2 -0
  24. data/lib/spidr/page/html.rb +21 -20
  25. data/lib/spidr/page/status_codes.rb +15 -11
  26. data/lib/spidr/page.rb +3 -1
  27. data/lib/spidr/proxy.rb +8 -14
  28. data/lib/spidr/rules.rb +7 -8
  29. data/lib/spidr/session_cache.rb +26 -22
  30. data/lib/spidr/settings/proxy.rb +22 -6
  31. data/lib/spidr/settings/timeouts.rb +2 -0
  32. data/lib/spidr/settings/user_agent.rb +2 -0
  33. data/lib/spidr/settings.rb +5 -3
  34. data/lib/spidr/spidr.rb +22 -11
  35. data/lib/spidr/version.rb +3 -1
  36. data/lib/spidr.rb +5 -3
  37. data/spec/agent_spec.rb +356 -7
  38. data/spec/example_page.rb +2 -0
  39. data/spec/page/content_types_spec.rb +22 -0
  40. data/spec/page/html_spec.rb +255 -51
  41. data/spec/page/status_codes_spec.rb +4 -4
  42. data/spec/proxy_spec.rb +2 -2
  43. data/spec/settings/proxy_examples.rb +31 -11
  44. data/spec/spec_helper.rb +3 -0
  45. data/spidr.gemspec +1 -4
  46. metadata +8 -7
  47. data/.travis.yml +0 -16
data/lib/spidr/page/html.rb CHANGED
@@ -1,5 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../extensions/uri'
4
+
1
5
  require 'nokogiri'
2
- require 'spidr/extensions/uri'
3
6
 
4
7
  module Spidr
5
8
  class Page
@@ -105,7 +108,9 @@ module Spidr
105
108
  def each_redirect(&block)
106
109
  return enum_for(__method__) unless block
107
110
 
108
- if (locations = @response.get_fields('Location'))
111
+ locations = @response.get_fields('Location')
112
+
113
+ unless (locations.nil? || locations.empty?)
109
114
  # Location headers override any meta-refresh redirects in the HTML
110
115
  locations.each(&block)
111
116
  else
@@ -175,34 +180,30 @@ module Spidr
175
180
  #
176
181
  # @since 0.3.0
177
182
  #
178
- def each_link
183
+ def each_link(&block)
179
184
  return enum_for(__method__) unless block_given?
180
185
 
181
- filter = lambda { |url|
182
- yield url unless (url.nil? || url.empty?)
183
- }
184
-
185
- each_redirect(&filter) if is_redirect?
186
+ each_redirect(&block) if is_redirect?
186
187
 
187
188
  if (html? && doc)
188
- doc.search('//a[@href]').each do |a|
189
- filter.call(a.get_attribute('href'))
189
+ doc.search('//a[@href[string()]]').each do |a|
190
+ yield a.get_attribute('href')
190
191
  end
191
192
 
192
- doc.search('//frame[@src]').each do |iframe|
193
- filter.call(iframe.get_attribute('src'))
193
+ doc.search('//frame[@src[string()]]').each do |iframe|
194
+ yield iframe.get_attribute('src')
194
195
  end
195
196
 
196
- doc.search('//iframe[@src]').each do |iframe|
197
- filter.call(iframe.get_attribute('src'))
197
+ doc.search('//iframe[@src[string()]]').each do |iframe|
198
+ yield iframe.get_attribute('src')
198
199
  end
199
200
 
200
- doc.search('//link[@href]').each do |link|
201
- filter.call(link.get_attribute('href'))
201
+ doc.search('//link[@href[string()]]').each do |link|
202
+ yield link.get_attribute('href')
202
203
  end
203
204
 
204
- doc.search('//script[@src]').each do |script|
205
- filter.call(script.get_attribute('src'))
205
+ doc.search('//script[@src[string()]]').each do |script|
206
+ yield script.get_attribute('src')
206
207
  end
207
208
  end
208
209
  end
@@ -211,7 +212,7 @@ module Spidr
211
212
  # The links from within the page.
212
213
  #
213
214
  # @return [Array<String>]
214
- # All links within the HTML page, frame/iframe source URLs and any
215
+ # All links within the HTML page, `frame`/`iframe` source URLs and any
215
216
  # links in the `Location` header.
216
217
  #
217
218
  def links
@@ -267,7 +268,7 @@ module Spidr
267
268
  link = link.to_s
268
269
  new_url = begin
269
270
  url.merge(link)
270
- rescue Exception
271
+ rescue URI::Error
271
272
  return
272
273
  end
273
274
 
data/lib/spidr/page/status_codes.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Spidr
2
4
  class Page
3
5
  #
@@ -22,16 +24,6 @@ module Spidr
22
24
 
23
25
  alias ok? is_ok?
24
26
 
25
- #
26
- # Determines if the response code is `308`.
27
- #
28
- # @return [Boolean]
29
- # Specifies whether the response code is `308`.
30
- #
31
- def timedout?
32
- code == 308
33
- end
34
-
35
27
  #
36
28
  # Determines if the response code is `400`.
37
29
  #
@@ -78,6 +70,18 @@ module Spidr
78
70
 
79
71
  alias missing? is_missing?
80
72
 
73
+ #
74
+ # Determines if the response code is `408`.
75
+ #
76
+ # @return [Boolean]
77
+ # Specifies whether the response code is `408`.
78
+ #
79
+ def is_timedout?
80
+ code == 408
81
+ end
82
+
83
+ alias timedout? is_timedout?
84
+
81
85
  #
82
86
  # Determines if the response code is `500`.
83
87
  #
@@ -90,7 +94,7 @@ module Spidr
90
94
 
91
95
  #
92
96
  # Determines if the response code is `300`, `301`, `302`, `303`
93
- # or `307`. Also checks for "soft" redirects added at the page
97
+ # or `307`. Also checks for "soft" redirects added at the page
94
98
  # level by a meta refresh tag.
95
99
  #
96
100
  # @return [Boolean]
data/lib/spidr/page.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Spidr
2
4
  #
3
5
  # Represents a requested page from a website.
@@ -142,7 +144,7 @@ module Spidr
142
144
 
143
145
  return super(name,*arguments,&block)
144
146
  end
145
-
147
+
146
148
  end
147
149
  end
148
150
 
data/lib/spidr/proxy.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Spidr
2
4
  #
3
5
  # @since 0.6.0
@@ -10,28 +12,20 @@ module Spidr
10
12
  #
11
13
  # Initializes the proxy.
12
14
  #
13
- # @param [Hash] attributes
14
- # Attributes for the proxy.
15
- #
16
- # @option attributes [String] :host
15
+ # @param [String] host
17
16
  # The host the proxy is running on.
18
17
  #
19
- # @option attributes [Integer] :port
18
+ # @param [Integer] port
20
19
  # The port the proxy is running on.
21
20
  #
22
- # @option attributes [String] :user
21
+ # @param [String] user
23
22
  # The user to authenticate as with the proxy.
24
23
  #
25
- # @option attributes [String] :password
24
+ # @param [String] password
26
25
  # The password to authenticate with.
27
26
  #
28
- def initialize(attributes={})
29
- super(
30
- attributes[:host],
31
- attributes.fetch(:port,DEFAULT_PORT),
32
- attributes[:user],
33
- attributes[:password]
34
- )
27
+ def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
28
+ super(host,port,user,password)
35
29
  end
36
30
 
37
31
  #
data/lib/spidr/rules.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Spidr
2
4
  #
3
5
  # The {Rules} class represents collections of acceptance and rejection
@@ -14,21 +16,18 @@ module Spidr
14
16
  #
15
17
  # Creates a new Rules object.
16
18
  #
17
- # @param [Hash] options
18
- # Additional options.
19
- #
20
- # @option options [Array<String, Regexp, Proc>] :accept
19
+ # @param [Array<String, Regexp, Proc>, nil] accept
21
20
  # The patterns to accept data with.
22
21
  #
23
- # @option options [Array<String, Regexp, Proc>] :reject
22
+ # @param [Array<String, Regexp, Proc>, nil] reject
24
23
  # The patterns to reject data with.
25
24
  #
26
- def initialize(options={})
25
+ def initialize(accept: nil, reject: nil)
27
26
  @accept = []
28
27
  @reject = []
29
28
 
30
- @accept += options[:accept] if options[:accept]
31
- @reject += options[:reject] if options[:reject]
29
+ @accept += accept if accept
30
+ @reject += reject if reject
32
31
  end
33
32
 
34
33
  #
data/lib/spidr/session_cache.rb CHANGED
@@ -1,6 +1,8 @@
1
- require 'spidr/settings/proxy'
2
- require 'spidr/settings/timeouts'
3
- require 'spidr/spidr'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'settings/proxy'
4
+ require_relative 'settings/timeouts'
5
+ require_relative 'spidr'
4
6
 
5
7
  require 'net/http'
6
8
  require 'openssl'
@@ -17,37 +19,39 @@ module Spidr
17
19
  #
18
20
  # Creates a new session cache.
19
21
  #
20
- # @param [Hash] options
21
- # Configuration options.
22
- #
23
- # @option [Hash] :proxy (Spidr.proxy)
22
+ # @param [Hash] proxy
24
23
  # Proxy options.
25
24
  #
26
- # @option [Integer] :open_timeout (Spidr.open_timeout)
27
- # Optional open timeout.
25
+ # @param [Integer] open_timeout
26
+ # Optional open connection timeout.
28
27
  #
29
- # @option [Integer] :ssl_timeout (Spidr.ssl_timeout)
30
- # Optional ssl timeout.
28
+ # @param [Integer] ssl_timeout
29
+ # Optional SSL connection timeout.
31
30
  #
32
- # @option [Integer] :read_timeout (Spidr.read_timeout)
31
+ # @param [Integer] read_timeout
33
32
  # Optional read timeout.
34
33
  #
35
- # @option [Integer] :continue_timeout (Spidr.continue_timeout)
34
+ # @param [Integer] continue_timeout
36
35
  # Optional `Continue` timeout.
37
36
  #
38
- # @option [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
37
+ # @param [Integer] keep_alive_timeout
39
38
  # Optional `Keep-Alive` timeout.
40
39
  #
41
40
  # @since 0.6.0
42
41
  #
43
- def initialize(options={})
44
- @proxy = options.fetch(:proxy,Spidr.proxy)
42
+ def initialize(proxy: Spidr.proxy,
43
+ open_timeout: Spidr.open_timeout,
44
+ ssl_timeout: Spidr.ssl_timeout,
45
+ read_timeout: Spidr.read_timeout,
46
+ continue_timeout: Spidr.continue_timeout,
47
+ keep_alive_timeout: Spidr.keep_alive_timeout)
48
+ self.proxy = proxy
45
49
 
46
- @open_timeout = options.fetch(:open_timeout,Spidr.open_timeout)
47
- @ssl_timeout = options.fetch(:ssl_timeout,Spidr.ssl_timeout)
48
- @read_timeout = options.fetch(:read_timeout,Spidr.read_timeout)
49
- @continue_timeout = options.fetch(:continue_timeout,Spidr.continue_timeout)
50
- @keep_alive_timeout = options.fetch(:keep_alive_timeout,Spidr.keep_alive_timeout)
50
+ self.open_timeout = open_timeout
51
+ self.ssl_timeout = ssl_timeout
52
+ self.read_timeout = read_timeout
53
+ self.continue_timeout = continue_timeout
54
+ self.keep_alive_timeout = keep_alive_timeout
51
55
 
52
56
  @sessions = {}
53
57
  end
@@ -133,7 +137,7 @@ module Spidr
133
137
  key = key_for(url)
134
138
 
135
139
  if (sess = @sessions[key])
136
- begin
140
+ begin
137
141
  sess.finish
138
142
  rescue IOError
139
143
  end
data/lib/spidr/settings/proxy.rb CHANGED
@@ -1,4 +1,8 @@
1
- require 'spidr/proxy'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../proxy'
4
+
5
+ require 'uri/http'
2
6
 
3
7
  module Spidr
4
8
  module Settings
@@ -21,7 +25,7 @@ module Spidr
21
25
  #
22
26
  # Sets the proxy information used by Agent objects.
23
27
  #
24
- # @param [Spidr::Proxy, Hash, nil] new_proxy
28
+ # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] new_proxy
25
29
  # The new proxy information.
26
30
  #
27
31
  # @option new_proxy [String] :host
@@ -41,11 +45,23 @@ module Spidr
41
45
  #
42
46
  def proxy=(new_proxy)
43
47
  @proxy = case new_proxy
44
- when Spidr::Proxy then new_proxy
45
- when Hash then Spidr::Proxy.new(new_proxy)
46
- when nil then Spidr::Proxy.new
48
+ when Spidr::Proxy
49
+ new_proxy
50
+ when Hash
51
+ Spidr::Proxy.new(**new_proxy)
52
+ when String, URI::HTTP
53
+ proxy_uri = URI(new_proxy)
54
+
55
+ Spidr::Proxy.new(
56
+ host: proxy_uri.host,
57
+ port: proxy_uri.port,
58
+ user: proxy_uri.user,
59
+ password: proxy_uri.password
60
+ )
61
+ when nil
62
+ Spidr::Proxy.new
47
63
  else
48
- raise(TypeError,"#{self.class}#{__method__} only accepts Proxy, Hash or nil")
64
+ raise(TypeError,"#{self.class}#{__method__} only accepts Spidr::Proxy, URI::HTTP, Hash, or nil")
49
65
  end
50
66
  end
51
67
 
data/lib/spidr/settings/timeouts.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Spidr
2
4
  module Settings
3
5
  #
data/lib/spidr/settings/user_agent.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Spidr
2
4
  module Settings
3
5
  #
data/lib/spidr/settings.rb CHANGED
@@ -1,3 +1,5 @@
1
- require 'spidr/settings/proxy'
2
- require 'spidr/settings/timeouts'
3
- require 'spidr/settings/user_agent'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'settings/proxy'
4
+ require_relative 'settings/timeouts'
5
+ require_relative 'settings/user_agent'
data/lib/spidr/spidr.rb CHANGED
@@ -1,7 +1,9 @@
1
- require 'spidr/settings/proxy'
2
- require 'spidr/settings/timeouts'
3
- require 'spidr/settings/user_agent'
4
- require 'spidr/agent'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'settings/proxy'
4
+ require_relative 'settings/timeouts'
5
+ require_relative 'settings/user_agent'
6
+ require_relative 'agent'
5
7
 
6
8
  module Spidr
7
9
  extend Settings::Proxy
@@ -36,25 +38,34 @@ module Spidr
36
38
  #
37
39
  # @see Agent.start_at
38
40
  #
39
- def self.start_at(url,options={},&block)
40
- Agent.start_at(url,options,&block)
41
+ def self.start_at(url,**kwargs,&block)
42
+ Agent.start_at(url,**kwargs,&block)
41
43
  end
42
44
 
43
45
  #
44
46
  # @see Agent.host
45
47
  #
46
- def self.host(name,options={},&block)
47
- Agent.host(name,options,&block)
48
+ def self.host(name,**kwargs,&block)
49
+ Agent.host(name,**kwargs,&block)
50
+ end
51
+
52
+ #
53
+ # @see Agent.domain
54
+ #
55
+ # @since 0.7.0
56
+ #
57
+ def self.domain(name,**kwargs,&block)
58
+ Agent.domain(name,**kwargs,&block)
48
59
  end
49
60
 
50
61
  #
51
62
  # @see Agent.site
52
63
  #
53
- def self.site(url,options={},&block)
54
- Agent.site(url,options,&block)
64
+ def self.site(url,**kwargs,&block)
65
+ Agent.site(url,**kwargs,&block)
55
66
  end
56
67
 
57
- #
68
+ #
58
69
  # @abstract
59
70
  #
60
71
  def self.robots
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,6 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Spidr
2
4
  # Spidr version
3
- VERSION = '0.6.1'
5
+ VERSION = '0.7.1'
4
6
  end
data/lib/spidr.rb CHANGED
@@ -1,3 +1,5 @@
1
- require 'spidr/agent'
2
- require 'spidr/spidr'
3
- require 'spidr/version'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'spidr/agent'
4
+ require_relative 'spidr/spidr'
5
+ require_relative 'spidr/version'