spidr 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -105,7 +105,9 @@ module Spidr
105
105
  def each_redirect(&block)
106
106
  return enum_for(__method__) unless block
107
107
 
108
- if (locations = @response.get_fields('Location'))
108
+ locations = @response.get_fields('Location')
109
+
110
+ unless (locations.nil? || locations.empty?)
109
111
  # Location headers override any meta-refresh redirects in the HTML
110
112
  locations.each(&block)
111
113
  else
@@ -175,34 +177,30 @@ module Spidr
175
177
  #
176
178
  # @since 0.3.0
177
179
  #
178
- def each_link
180
+ def each_link(&block)
179
181
  return enum_for(__method__) unless block_given?
180
182
 
181
- filter = lambda { |url|
182
- yield url unless (url.nil? || url.empty?)
183
- }
184
-
185
- each_redirect(&filter) if is_redirect?
183
+ each_redirect(&block) if is_redirect?
186
184
 
187
185
  if (html? && doc)
188
- doc.search('//a[@href]').each do |a|
189
- filter.call(a.get_attribute('href'))
186
+ doc.search('//a[@href[string()]]').each do |a|
187
+ yield a.get_attribute('href')
190
188
  end
191
189
 
192
- doc.search('//frame[@src]').each do |iframe|
193
- filter.call(iframe.get_attribute('src'))
190
+ doc.search('//frame[@src[string()]]').each do |iframe|
191
+ yield iframe.get_attribute('src')
194
192
  end
195
193
 
196
- doc.search('//iframe[@src]').each do |iframe|
197
- filter.call(iframe.get_attribute('src'))
194
+ doc.search('//iframe[@src[string()]]').each do |iframe|
195
+ yield iframe.get_attribute('src')
198
196
  end
199
197
 
200
- doc.search('//link[@href]').each do |link|
201
- filter.call(link.get_attribute('href'))
198
+ doc.search('//link[@href[string()]]').each do |link|
199
+ yield link.get_attribute('href')
202
200
  end
203
201
 
204
- doc.search('//script[@src]').each do |script|
205
- filter.call(script.get_attribute('src'))
202
+ doc.search('//script[@src[string()]]').each do |script|
203
+ yield script.get_attribute('src')
206
204
  end
207
205
  end
208
206
  end
@@ -211,7 +209,7 @@ module Spidr
211
209
  # The links from within the page.
212
210
  #
213
211
  # @return [Array<String>]
214
- # All links within the HTML page, frame/iframe source URLs and any
212
+ # All links within the HTML page, `frame`/`iframe` source URLs and any
215
213
  # links in the `Location` header.
216
214
  #
217
215
  def links
@@ -271,7 +269,7 @@ module Spidr
271
269
  return
272
270
  end
273
271
 
274
- if (path = new_url.path)
272
+ if (!new_url.opaque) && (path = new_url.path)
275
273
  # ensure that paths begin with a leading '/' for URI::FTP
276
274
  if (new_url.scheme == 'ftp' && !path.start_with?('/'))
277
275
  path.insert(0,'/')
@@ -22,16 +22,6 @@ module Spidr
22
22
 
23
23
  alias ok? is_ok?
24
24
 
25
- #
26
- # Determines if the response code is `308`.
27
- #
28
- # @return [Boolean]
29
- # Specifies whether the response code is `308`.
30
- #
31
- def timedout?
32
- code == 308
33
- end
34
-
35
25
  #
36
26
  # Determines if the response code is `400`.
37
27
  #
@@ -78,6 +68,18 @@ module Spidr
78
68
 
79
69
  alias missing? is_missing?
80
70
 
71
+ #
72
+ # Determines if the response code is `408`.
73
+ #
74
+ # @return [Boolean]
75
+ # Specifies whether the response code is `408`.
76
+ #
77
+ def is_timedout?
78
+ code == 408
79
+ end
80
+
81
+ alias timedout? is_timedout?
82
+
81
83
  #
82
84
  # Determines if the response code is `500`.
83
85
  #
data/lib/spidr/proxy.rb CHANGED
@@ -10,28 +10,20 @@ module Spidr
10
10
  #
11
11
  # Initializes the proxy.
12
12
  #
13
- # @param [Hash] attributes
14
- # Attributes for the proxy.
15
- #
16
- # @option attributes [String] :host
13
+ # @param [String] host
17
14
  # The host the proxy is running on.
18
15
  #
19
- # @option attributes [Integer] :port
16
+ # @param [Integer] port
20
17
  # The port the proxy is running on.
21
18
  #
22
- # @option attributes [String] :user
19
+ # @param [String] user
23
20
  # The user to authenticate as with the proxy.
24
21
  #
25
- # @option attributes [String] :password
22
+ # @param [String] password
26
23
  # The password to authenticate with.
27
24
  #
28
- def initialize(attributes={})
29
- super(
30
- attributes[:host],
31
- attributes.fetch(:port,DEFAULT_PORT),
32
- attributes[:user],
33
- attributes[:password]
34
- )
25
+ def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
26
+ super(host,port,user,password)
35
27
  end
36
28
 
37
29
  #
data/lib/spidr/rules.rb CHANGED
@@ -14,21 +14,18 @@ module Spidr
14
14
  #
15
15
  # Creates a new Rules object.
16
16
  #
17
- # @param [Hash] options
18
- # Additional options.
19
- #
20
- # @option options [Array<String, Regexp, Proc>] :accept
17
+ # @param [Array<String, Regexp, Proc>, nil] accept
21
18
  # The patterns to accept data with.
22
19
  #
23
- # @option options [Array<String, Regexp, Proc>] :reject
20
+ # @param [Array<String, Regexp, Proc>, nil] reject
24
21
  # The patterns to reject data with.
25
22
  #
26
- def initialize(options={})
23
+ def initialize(accept: nil, reject: nil)
27
24
  @accept = []
28
25
  @reject = []
29
26
 
30
- @accept += options[:accept] if options[:accept]
31
- @reject += options[:reject] if options[:reject]
27
+ @accept += accept if accept
28
+ @reject += reject if reject
32
29
  end
33
30
 
34
31
  #
@@ -17,37 +17,39 @@ module Spidr
17
17
  #
18
18
  # Creates a new session cache.
19
19
  #
20
- # @param [Hash] options
21
- # Configuration options.
22
- #
23
- # @option [Hash] :proxy (Spidr.proxy)
20
+ # @param [Hash] proxy
24
21
  # Proxy options.
25
22
  #
26
- # @option [Integer] :open_timeout (Spidr.open_timeout)
27
- # Optional open timeout.
23
+ # @param [Integer] open_timeout
24
+ # Optional open connection timeout.
28
25
  #
29
- # @option [Integer] :ssl_timeout (Spidr.ssl_timeout)
30
- # Optional ssl timeout.
26
+ # @param [Integer] ssl_timeout
27
+ # Optional SSL connection timeout.
31
28
  #
32
- # @option [Integer] :read_timeout (Spidr.read_timeout)
29
+ # @param [Integer] read_timeout
33
30
  # Optional read timeout.
34
31
  #
35
- # @option [Integer] :continue_timeout (Spidr.continue_timeout)
32
+ # @param [Integer] continue_timeout
36
33
  # Optional `Continue` timeout.
37
34
  #
38
- # @option [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
35
+ # @param [Integer] keep_alive_timeout
39
36
  # Optional `Keep-Alive` timeout.
40
37
  #
41
38
  # @since 0.6.0
42
39
  #
43
- def initialize(options={})
44
- @proxy = options.fetch(:proxy,Spidr.proxy)
40
+ def initialize(proxy: Spidr.proxy,
41
+ open_timeout: Spidr.open_timeout,
42
+ ssl_timeout: Spidr.ssl_timeout,
43
+ read_timeout: Spidr.read_timeout,
44
+ continue_timeout: Spidr.continue_timeout,
45
+ keep_alive_timeout: Spidr.keep_alive_timeout)
46
+ self.proxy = proxy
45
47
 
46
- @open_timeout = options.fetch(:open_timeout,Spidr.open_timeout)
47
- @ssl_timeout = options.fetch(:ssl_timeout,Spidr.ssl_timeout)
48
- @read_timeout = options.fetch(:read_timeout,Spidr.read_timeout)
49
- @continue_timeout = options.fetch(:continue_timeout,Spidr.continue_timeout)
50
- @keep_alive_timeout = options.fetch(:keep_alive_timeout,Spidr.keep_alive_timeout)
48
+ self.open_timeout = open_timeout
49
+ self.ssl_timeout = ssl_timeout
50
+ self.read_timeout = read_timeout
51
+ self.continue_timeout = continue_timeout
52
+ self.keep_alive_timeout = keep_alive_timeout
51
53
 
52
54
  @sessions = {}
53
55
  end
@@ -65,7 +67,7 @@ module Spidr
65
67
  #
66
68
  def active?(url)
67
69
  # normalize the url
68
- url = URI(url.to_s) unless url.kind_of?(URI)
70
+ url = URI(url)
69
71
 
70
72
  # session key
71
73
  key = key_for(url)
@@ -84,7 +86,7 @@ module Spidr
84
86
  #
85
87
  def [](url)
86
88
  # normalize the url
87
- url = URI(url.to_s) unless url.kind_of?(URI)
89
+ url = URI(url)
88
90
 
89
91
  # session key
90
92
  key = key_for(url)
@@ -127,7 +129,7 @@ module Spidr
127
129
  #
128
130
  def kill!(url)
129
131
  # normalize the url
130
- url = URI(url.to_s) unless url.kind_of?(URI)
132
+ url = URI(url)
131
133
 
132
134
  # session key
133
135
  key = key_for(url)
@@ -1,5 +1,7 @@
1
1
  require 'spidr/proxy'
2
2
 
3
+ require 'uri/http'
4
+
3
5
  module Spidr
4
6
  module Settings
5
7
  #
@@ -21,7 +23,7 @@ module Spidr
21
23
  #
22
24
  # Sets the proxy information used by Agent objects.
23
25
  #
24
- # @param [Spidr::Proxy, Hash, nil] new_proxy
26
+ # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] new_proxy
25
27
  # The new proxy information.
26
28
  #
27
29
  # @option new_proxy [String] :host
@@ -41,11 +43,23 @@ module Spidr
41
43
  #
42
44
  def proxy=(new_proxy)
43
45
  @proxy = case new_proxy
44
- when Spidr::Proxy then new_proxy
45
- when Hash then Spidr::Proxy.new(new_proxy)
46
- when nil then Spidr::Proxy.new
46
+ when Spidr::Proxy
47
+ new_proxy
48
+ when Hash
49
+ Spidr::Proxy.new(**new_proxy)
50
+ when String, URI::HTTP
51
+ proxy_uri = URI(new_proxy)
52
+
53
+ Spidr::Proxy.new(
54
+ host: proxy_uri.host,
55
+ port: proxy_uri.port,
56
+ user: proxy_uri.user,
57
+ password: proxy_uri.password
58
+ )
59
+ when nil
60
+ Spidr::Proxy.new
47
61
  else
48
- raise(TypeError,"#{self.class}#{__method__} only accepts Proxy, Hash or nil")
62
+ raise(TypeError,"#{self.class}#{__method__} only accepts Spidr::Proxy, URI::HTTP, Hash, or nil")
49
63
  end
50
64
  end
51
65
 
data/lib/spidr/spidr.rb CHANGED
@@ -16,6 +16,7 @@ module Spidr
16
16
  # @since 0.5.0
17
17
  #
18
18
  def self.robots?
19
+ @robots ||= false
19
20
  @robots
20
21
  end
21
22
 
@@ -35,22 +36,31 @@ module Spidr
35
36
  #
36
37
  # @see Agent.start_at
37
38
  #
38
- def self.start_at(url,options={},&block)
39
- Agent.start_at(url,options,&block)
39
+ def self.start_at(url,**kwargs,&block)
40
+ Agent.start_at(url,**kwargs,&block)
40
41
  end
41
42
 
42
43
  #
43
44
  # @see Agent.host
44
45
  #
45
- def self.host(name,options={},&block)
46
- Agent.host(name,options,&block)
46
+ def self.host(name,**kwargs,&block)
47
+ Agent.host(name,**kwargs,&block)
48
+ end
49
+
50
#
# Forwards its arguments to {Agent.domain}.
#
# @param name
#   Passed through unchanged as the first argument of {Agent.domain}.
#
# @param [Hash] options
#   Legacy positional options Hash. Its entries are forwarded as keyword
#   arguments, the same way {Spidr.start_at}, {Spidr.host} and {Spidr.site}
#   forward theirs.
#
# @param [Hash{Symbol => Object}] kwargs
#   Keyword arguments forwarded to {Agent.domain}. On a duplicate key the
#   keyword argument wins over the entry in `options`.
#
# @yield
#   The given block, if any, is forwarded to {Agent.domain}.
#
# @see Agent.domain
#
# @since 0.7.0
#
def self.domain(name,options={},**kwargs,&block)
  # NOTE(review): assumes Agent.domain takes keyword arguments like the
  # sibling Agent class methods -- confirm against agent.rb.
  Agent.domain(name,**options,**kwargs,&block)
end
48
58
 
49
59
  #
50
60
  # @see Agent.site
51
61
  #
52
- def self.site(url,options={},&block)
53
- Agent.site(url,options,&block)
62
+ def self.site(url,**kwargs,&block)
63
+ Agent.site(url,**kwargs,&block)
54
64
  end
55
65
 
56
66
  #
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.6.0'
3
+ VERSION = '0.7.0'
4
4
  end