webinspector 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/web_inspector/page.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  require 'uri'
3
5
  require 'open-uri'
@@ -5,133 +7,211 @@ require 'open_uri_redirections'
5
7
  require 'faraday'
6
8
  require 'public_suffix'
7
9
 
10
+ # Explicitly load Faraday::Retry if available
11
+ begin
12
+ require 'faraday/retry'
13
+ rescue LoadError
14
+ # Faraday retry is not available
15
+ end
16
+
8
17
  require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
9
18
  require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
10
19
 
11
20
  module WebInspector
12
21
  class Page
13
- attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :domain_links, :domain_images, :images, :response
14
-
22
+ attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links,
23
+ :domain_links, :domain_images, :images, :response, :status_code, :favicon
24
+
25
+ DEFAULT_TIMEOUT = 30
26
+ DEFAULT_RETRIES = 3
27
+ DEFAULT_USER_AGENT = -> { "WebInspector/#{WebInspector::VERSION} (+https://github.com/davidesantangelo/webinspector)" }
28
+
29
+ # Initialize a new WebInspector Page
30
+ #
31
+ # @param url [String] The URL to inspect
32
+ # @param options [Hash] Optional parameters
33
+ # @option options [Integer] :timeout Request timeout in seconds
34
+ # @option options [Integer] :retries Number of retries for failed requests
35
+ # @option options [Hash] :headers Custom HTTP headers
36
+ # @option options [Boolean] :allow_redirections Whether to follow redirects
37
+ # @option options [String] :user_agent Custom user agent
15
38
  def initialize(url, options = {})
16
39
  @url = url
17
40
  @options = options
41
+ @retries = options[:retries] || DEFAULT_RETRIES
42
+ @timeout = options[:timeout] || DEFAULT_TIMEOUT
43
+ @headers = options[:headers] || { 'User-Agent' => options[:user_agent] || DEFAULT_USER_AGENT.call }
44
+ @allow_redirections = options[:allow_redirections].nil? || options[:allow_redirections]
45
+
18
46
  @request = WebInspector::Request.new(url)
19
- @inspector = WebInspector::Inspector.new(page)
20
- end
21
47
 
22
- def title
23
- @inspector.title
48
+ begin
49
+ @inspector = WebInspector::Inspector.new(page)
50
+ @inspector.set_url(url, host)
51
+ @status_code = 200
52
+ rescue StandardError => e
53
+ @error = e
54
+ @status_code = e.respond_to?(:status_code) ? e.status_code : 500
55
+ end
24
56
  end
25
57
 
26
- def description
27
- @inspector.description
58
+ # Check if the page was successfully loaded
59
+ #
60
+ # @return [Boolean] true if the page was loaded, false otherwise
61
+ def success?
62
+ !@inspector.nil? && !@error
28
63
  end
29
64
 
30
- def body
31
- @inspector.body
32
- end
33
-
34
- def links
35
- @inspector.links
65
+ # Get the error message if any
66
+ #
67
+ # @return [String, nil] The error message or nil if no error
68
+ def error_message
69
+ @error&.message
36
70
  end
37
71
 
38
- def images
39
- @inspector.images
40
- end
72
+ # Delegate methods to inspector
73
+ %i[title description body links images meta].each do |method|
74
+ define_method(method) do
75
+ return nil unless success?
41
76
 
42
- def meta
43
- @inspector.meta
77
+ @inspector.send(method)
78
+ end
44
79
  end
45
80
 
81
+ # Special case for find method that takes arguments
46
82
  def find(words)
83
+ return nil unless success?
84
+
47
85
  @inspector.find(words)
48
86
  end
49
87
 
50
- def url
51
- @request.url
88
+ # Delegate methods to request
89
+ %i[url host domain scheme port].each do |method|
90
+ define_method(method) do
91
+ @request.send(method)
92
+ end
52
93
  end
53
94
 
54
- def host
55
- @request.host
56
- end
95
+ # Get the favicon URL if available
96
+ #
97
+ # @return [String, nil] The favicon URL or nil if not found
98
+ def favicon
99
+ return @favicon if defined?(@favicon)
57
100
 
58
- def domain
59
- @request.domain
60
- end
101
+ return nil unless success?
61
102
 
62
- def scheme
63
- @request.scheme
64
- end
103
+ @favicon = begin
104
+ # Try multiple approaches to find favicon
105
+
106
+ # 1. Look for standard favicon link tags
107
+ favicon_link = @inspector.page.css("link[rel='shortcut icon'], link[rel='icon'], link[rel='apple-touch-icon']").first
108
+ if favicon_link && favicon_link['href']
109
+ begin
110
+ return URI.join(url, favicon_link['href']).to_s
111
+ rescue URI::InvalidURIError
112
+ # Try next method
113
+ end
114
+ end
65
115
 
66
- def port
67
- @request.port
116
+ # 2. Try the default location /favicon.ico
117
+ "#{scheme}://#{host}/favicon.ico"
118
+ rescue StandardError
119
+ nil
120
+ end
68
121
  end
69
122
 
70
123
  def domain_links(u = domain)
124
+ return [] unless success?
125
+
71
126
  @inspector.domain_links(u, host)
72
127
  end
73
128
 
74
129
  def domain_images(u = domain)
130
+ return [] unless success?
131
+
75
132
  @inspector.domain_images(u, host)
76
133
  end
77
134
 
135
+ # Get full JSON representation of the page
136
+ #
137
+ # @return [Hash] JSON representation of the page
78
138
  def to_hash
79
139
  {
80
- 'url' => url,
81
- 'scheme' => scheme,
82
- 'host' => host,
83
- 'port' => port,
84
- 'title' => title,
85
- 'description' => description,
86
- 'meta' => meta,
87
- 'links' => links,
88
- 'images' => images,
89
- 'response' => { 'status' => response.status,
90
- 'headers' => response.headers }
140
+ 'url' => url,
141
+ 'scheme' => scheme,
142
+ 'host' => host,
143
+ 'port' => port,
144
+ 'title' => title,
145
+ 'description' => description,
146
+ 'meta' => meta,
147
+ 'links' => links,
148
+ 'images' => images,
149
+ 'favicon' => favicon,
150
+ 'response' => {
151
+ 'status' => status_code,
152
+ 'headers' => response&.headers || {},
153
+ 'success' => success?
154
+ },
155
+ 'error' => error_message
91
156
  }
92
157
  end
93
158
 
94
159
  def response
95
160
  @response ||= fetch
96
- rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed, RuntimeError, URI::InvalidURIError => e
161
+ rescue StandardError => e
162
+ @error = e
97
163
  nil
98
164
  end
99
165
 
100
166
  private
101
-
167
+
102
168
  def fetch
103
- session = Faraday.new(:url => url) do |faraday|
104
- faraday.request :retry, max: @retries
169
+ session = Faraday.new(url: url) do |faraday|
170
+ # Configure retries based on available middleware
171
+ faraday.request :retry, { max: @retries } if defined?(Faraday::Retry)
105
172
 
173
+ # Configure redirect handling
106
174
  if @allow_redirections
107
- faraday.use FaradayMiddleware::FollowRedirects, limit: 10
108
- faraday.use :cookie_jar
175
+ begin
176
+ faraday.use FaradayMiddleware::FollowRedirects, limit: 10
177
+ faraday.use :cookie_jar
178
+ rescue NameError, NoMethodError
179
+ # Continue without middleware if not available
180
+ end
109
181
  end
110
182
 
111
- faraday.headers.merge!(@headers || {})
183
+ faraday.headers.merge!(@headers)
112
184
  faraday.adapter :net_http
113
185
  end
114
186
 
115
- response = session.get do |req|
116
- req.options.timeout = @connection_timeout
117
- req.options.open_timeout = @read_timeout
118
- end
187
+ # Manual retry mechanism as a backup
188
+ retries = 0
119
189
 
120
- @url = response.env.url.to_s
190
+ begin
191
+ response = session.get do |req|
192
+ req.options.timeout = @timeout
193
+ req.options.open_timeout = @timeout
194
+ end
121
195
 
122
- response
196
+ @url = response.env.url.to_s
197
+ response
198
+ rescue Faraday::TimeoutError, Faraday::ConnectionFailed => e
199
+ retries += 1
200
+ retry if retries <= @retries
201
+ raise e
202
+ end
123
203
  end
124
204
 
125
205
  def with_default_scheme(request)
126
- request.url && request.scheme.nil? ? 'http://' + request.url : request.url
127
- end
128
-
129
- def default_user_agent
130
- "WebInspector/#{WebInspector::VERSION} (+https://github.com/davidesantangelo/webinspector)"
206
+ request.url && request.scheme.nil? ? "http://#{request.url}" : request.url
131
207
  end
132
208
 
133
209
  def page
134
- Nokogiri::HTML(open(with_default_scheme(@request), allow_redirections: :safe))
210
+ # Use URI.open instead of open for Ruby 3.0+ compatibility
211
+ Nokogiri::HTML(URI.open(with_default_scheme(@request),
212
+ allow_redirections: :safe,
213
+ read_timeout: @timeout,
214
+ 'User-Agent' => @headers['User-Agent']))
135
215
  end
136
216
  end
137
217
  end
data/lib/web_inspector/request.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'addressable/uri'
2
4
 
3
5
  module WebInspector
@@ -13,7 +15,7 @@ module WebInspector
13
15
  def host
14
16
  uri.host
15
17
  end
16
-
18
+
17
19
  def domain
18
20
  suffix_domain
19
21
  end
@@ -24,23 +26,23 @@ module WebInspector
24
26
 
25
27
  def port
26
28
  URI(normalized_uri).port
27
- end
29
+ end
28
30
 
29
31
  private
30
-
32
+
31
33
  def suffix_domain
32
34
  return @domain if @domain
33
-
35
+
34
36
  begin
35
37
  @domain = PublicSuffix.parse(host).domain
36
- rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
38
+ rescue URI::InvalidURIError, PublicSuffix::DomainInvalid
37
39
  @domain = ''
38
40
  end
39
41
  end
40
-
42
+
41
43
  def uri
42
44
  Addressable::URI.parse(@url)
43
- rescue Addressable::URI::InvalidURIError => e
45
+ rescue Addressable::URI::InvalidURIError
44
46
  nil
45
47
  end
46
48
 
@@ -48,4 +50,4 @@ module WebInspector
48
50
  uri.normalize.to_s
49
51
  end
50
52
  end
51
- end
53
+ end
data/lib/web_inspector/version.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module WebInspector
2
- VERSION = "0.5.0"
4
+ VERSION = '1.0.0'
3
5
  end
data/lib/web_inspector.rb CHANGED
@@ -1,10 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/page'))
2
4
  require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/version'))
3
5
 
4
6
  module WebInspector
5
- extend self
7
+ module_function
6
8
 
7
9
  def new(url, options = {})
8
10
  Page.new(url, options)
9
11
  end
10
- end
12
+ end
data/lib/webinspector.rb CHANGED
@@ -1 +1,3 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), './web_inspector'))
1
+ # frozen_string_literal: true
2
+
3
+ require File.expand_path(File.join(File.dirname(__FILE__), './web_inspector'))
data/webinspector.gemspec CHANGED
@@ -1,38 +1,45 @@
1
- # coding: utf-8
2
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path('lib', __dir__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require File.expand_path('../lib/web_inspector/version', __FILE__)
5
+ require File.expand_path('lib/web_inspector/version', __dir__)
5
6
 
6
7
  Gem::Specification.new do |spec|
7
- spec.name = "webinspector"
8
+ spec.name = 'webinspector'
8
9
  spec.version = WebInspector::VERSION
9
- spec.authors = ["Davide Santangelo"]
10
- spec.email = ["davide.santangelo@gmail.com"]
10
+ spec.authors = ['Davide Santangelo']
11
+ spec.email = ['davide.santangelo@gmail.com']
11
12
 
12
- spec.summary = %q{Ruby gem to inspect completely a web page.}
13
- spec.description = %q{Ruby gem to inspect completely a web page. It scrapes a given URL, and returns you its meta, links, images and more.}
14
- spec.homepage = ""
15
- spec.license = "MIT"
13
+ spec.summary = 'Ruby gem to inspect completely a web page.'
14
+ spec.description = 'Ruby gem to inspect completely a web page. It scrapes a given URL, and returns you its meta, links, images and more.'
15
+ spec.homepage = 'https://github.com/davidesantangelo/webinspector'
16
+ spec.license = 'MIT'
16
17
 
17
18
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
- spec.bindir = "exe"
19
+ spec.bindir = 'exe'
19
20
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
- spec.require_paths = ["lib"]
21
-
22
- spec.add_development_dependency "bundler", "~> 1.8"
23
- spec.add_development_dependency "rake", "~> 10.0"
21
+ spec.require_paths = ['lib']
22
+ spec.metadata = {
23
+ 'source_code_uri' => 'https://github.com/davidesantangelo/webinspector',
24
+ 'bug_tracker_uri' => 'https://github.com/davidesantangelo/webinspector/issues'
25
+ }
24
26
 
25
- spec.add_development_dependency 'rspec'
26
- spec.add_development_dependency "vcr"
27
- spec.add_development_dependency "typhoeus"
27
+ spec.required_ruby_version = '>= 3.0.0'
28
28
 
29
- spec.required_ruby_version = ">= 1.9.3"
29
+ spec.add_development_dependency 'rake', '~> 13.0'
30
+ spec.add_development_dependency 'rspec', '~> 3.12'
31
+ spec.add_development_dependency 'rubocop', '~> 1.50'
32
+ spec.add_development_dependency 'vcr', '~> 6.1'
33
+ spec.add_development_dependency 'webmock', '~> 3.18'
30
34
 
31
- spec.add_dependency "faraday"
32
- spec.add_dependency "json"
33
- spec.add_dependency "addressable"
34
- spec.add_dependency "nokogiri"
35
- spec.add_dependency "open_uri_redirections"
36
- spec.add_dependency "openurl"
37
- spec.add_dependency "public_suffix"
35
+ spec.add_dependency 'addressable', '~> 2.8'
36
+ spec.add_dependency 'faraday', '~> 2.7'
37
+ spec.add_dependency 'faraday-cookie_jar', '~> 0.0.7'
38
+ spec.add_dependency 'faraday-follow_redirects', '~> 0.3'
39
+ spec.add_dependency 'faraday-retry', '~> 2.1'
40
+ spec.add_dependency 'json', '~> 2.6'
41
+ spec.add_dependency 'nokogiri', '~> 1.14'
42
+ spec.add_dependency 'open_uri_redirections', '~> 0.2'
43
+ spec.add_dependency 'openurl', '~> 1.0'
44
+ spec.add_dependency 'public_suffix', '~> 5.0'
38
45
  end