webinspector 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Gemfile +2 -0
- data/README.md +52 -28
- data/Rakefile +2 -1
- data/bin/console +4 -3
- data/lib/web_inspector/inspector.rb +187 -95
- data/lib/web_inspector/meta.rb +36 -15
- data/lib/web_inspector/page.rb +142 -62
- data/lib/web_inspector/request.rb +10 -8
- data/lib/web_inspector/version.rb +3 -1
- data/lib/web_inspector.rb +4 -2
- data/lib/webinspector.rb +3 -1
- data/webinspector.gemspec +33 -26
- metadata +103 -60
data/lib/web_inspector/page.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'uri'
|
3
5
|
require 'open-uri'
|
@@ -5,133 +7,211 @@ require 'open_uri_redirections'
|
|
5
7
|
require 'faraday'
|
6
8
|
require 'public_suffix'
|
7
9
|
|
10
|
+
# Explicitly load Faraday::Retry if available
|
11
|
+
begin
|
12
|
+
require 'faraday/retry'
|
13
|
+
rescue LoadError
|
14
|
+
# Faraday retry is not available
|
15
|
+
end
|
16
|
+
|
8
17
|
require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
|
9
18
|
require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
|
10
19
|
|
11
20
|
module WebInspector
|
12
21
|
class Page
|
13
|
-
attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links,
|
14
|
-
|
22
|
+
attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links,
|
23
|
+
:domain_links, :domain_images, :images, :response, :status_code, :favicon
|
24
|
+
|
25
|
+
DEFAULT_TIMEOUT = 30
|
26
|
+
DEFAULT_RETRIES = 3
|
27
|
+
DEFAULT_USER_AGENT = -> { "WebInspector/#{WebInspector::VERSION} (+https://github.com/davidesantangelo/webinspector)" }
|
28
|
+
|
29
|
+
# Initialize a new WebInspector Page
|
30
|
+
#
|
31
|
+
# @param url [String] The URL to inspect
|
32
|
+
# @param options [Hash] Optional parameters
|
33
|
+
# @option options [Integer] :timeout Request timeout in seconds
|
34
|
+
# @option options [Integer] :retries Number of retries for failed requests
|
35
|
+
# @option options [Hash] :headers Custom HTTP headers
|
36
|
+
# @option options [Boolean] :allow_redirections Whether to follow redirects
|
37
|
+
# @option options [String] :user_agent Custom user agent
|
15
38
|
def initialize(url, options = {})
|
16
39
|
@url = url
|
17
40
|
@options = options
|
41
|
+
@retries = options[:retries] || DEFAULT_RETRIES
|
42
|
+
@timeout = options[:timeout] || DEFAULT_TIMEOUT
|
43
|
+
@headers = options[:headers] || { 'User-Agent' => options[:user_agent] || DEFAULT_USER_AGENT.call }
|
44
|
+
@allow_redirections = options[:allow_redirections].nil? || options[:allow_redirections]
|
45
|
+
|
18
46
|
@request = WebInspector::Request.new(url)
|
19
|
-
@inspector = WebInspector::Inspector.new(page)
|
20
|
-
end
|
21
47
|
|
22
|
-
|
23
|
-
|
48
|
+
begin
|
49
|
+
@inspector = WebInspector::Inspector.new(page)
|
50
|
+
@inspector.set_url(url, host)
|
51
|
+
@status_code = 200
|
52
|
+
rescue StandardError => e
|
53
|
+
@error = e
|
54
|
+
@status_code = e.respond_to?(:status_code) ? e.status_code : 500
|
55
|
+
end
|
24
56
|
end
|
25
57
|
|
26
|
-
|
27
|
-
|
58
|
+
# Check if the page was successfully loaded
|
59
|
+
#
|
60
|
+
# @return [Boolean] true if the page was loaded, false otherwise
|
61
|
+
def success?
|
62
|
+
!@inspector.nil? && !@error
|
28
63
|
end
|
29
64
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
@inspector.links
|
65
|
+
# Get the error message if any
|
66
|
+
#
|
67
|
+
# @return [String, nil] The error message or nil if no error
|
68
|
+
def error_message
|
69
|
+
@error&.message
|
36
70
|
end
|
37
71
|
|
38
|
-
|
39
|
-
|
40
|
-
|
72
|
+
# Delegate methods to inspector
|
73
|
+
%i[title description body links images meta].each do |method|
|
74
|
+
define_method(method) do
|
75
|
+
return nil unless success?
|
41
76
|
|
42
|
-
|
43
|
-
|
77
|
+
@inspector.send(method)
|
78
|
+
end
|
44
79
|
end
|
45
80
|
|
81
|
+
# Special case for find method that takes arguments
|
46
82
|
def find(words)
|
83
|
+
return nil unless success?
|
84
|
+
|
47
85
|
@inspector.find(words)
|
48
86
|
end
|
49
87
|
|
50
|
-
|
51
|
-
|
88
|
+
# Delegate methods to request
|
89
|
+
%i[url host domain scheme port].each do |method|
|
90
|
+
define_method(method) do
|
91
|
+
@request.send(method)
|
92
|
+
end
|
52
93
|
end
|
53
94
|
|
54
|
-
|
55
|
-
|
56
|
-
|
95
|
+
# Get the favicon URL if available
|
96
|
+
#
|
97
|
+
# @return [String, nil] The favicon URL or nil if not found
|
98
|
+
def favicon
|
99
|
+
return @favicon if defined?(@favicon)
|
57
100
|
|
58
|
-
|
59
|
-
@request.domain
|
60
|
-
end
|
101
|
+
return nil unless success?
|
61
102
|
|
62
|
-
|
63
|
-
|
64
|
-
|
103
|
+
@favicon = begin
|
104
|
+
# Try multiple approaches to find favicon
|
105
|
+
|
106
|
+
# 1. Look for standard favicon link tags
|
107
|
+
favicon_link = @inspector.page.css("link[rel='shortcut icon'], link[rel='icon'], link[rel='apple-touch-icon']").first
|
108
|
+
if favicon_link && favicon_link['href']
|
109
|
+
begin
|
110
|
+
return URI.join(url, favicon_link['href']).to_s
|
111
|
+
rescue URI::InvalidURIError
|
112
|
+
# Try next method
|
113
|
+
end
|
114
|
+
end
|
65
115
|
|
66
|
-
|
67
|
-
|
116
|
+
# 2. Try the default location /favicon.ico
|
117
|
+
"#{scheme}://#{host}/favicon.ico"
|
118
|
+
rescue StandardError
|
119
|
+
nil
|
120
|
+
end
|
68
121
|
end
|
69
122
|
|
70
123
|
def domain_links(u = domain)
|
124
|
+
return [] unless success?
|
125
|
+
|
71
126
|
@inspector.domain_links(u, host)
|
72
127
|
end
|
73
128
|
|
74
129
|
def domain_images(u = domain)
|
130
|
+
return [] unless success?
|
131
|
+
|
75
132
|
@inspector.domain_images(u, host)
|
76
133
|
end
|
77
134
|
|
135
|
+
# Get full JSON representation of the page
|
136
|
+
#
|
137
|
+
# @return [Hash] JSON representation of the page
|
78
138
|
def to_hash
|
79
139
|
{
|
80
|
-
'url'
|
81
|
-
'scheme'
|
82
|
-
'host'
|
83
|
-
'port'
|
84
|
-
'title'
|
85
|
-
'description'
|
86
|
-
'meta'
|
87
|
-
'links'
|
88
|
-
'images'
|
89
|
-
'
|
90
|
-
|
140
|
+
'url' => url,
|
141
|
+
'scheme' => scheme,
|
142
|
+
'host' => host,
|
143
|
+
'port' => port,
|
144
|
+
'title' => title,
|
145
|
+
'description' => description,
|
146
|
+
'meta' => meta,
|
147
|
+
'links' => links,
|
148
|
+
'images' => images,
|
149
|
+
'favicon' => favicon,
|
150
|
+
'response' => {
|
151
|
+
'status' => status_code,
|
152
|
+
'headers' => response&.headers || {},
|
153
|
+
'success' => success?
|
154
|
+
},
|
155
|
+
'error' => error_message
|
91
156
|
}
|
92
157
|
end
|
93
158
|
|
94
159
|
def response
|
95
160
|
@response ||= fetch
|
96
|
-
rescue
|
161
|
+
rescue StandardError => e
|
162
|
+
@error = e
|
97
163
|
nil
|
98
164
|
end
|
99
165
|
|
100
166
|
private
|
101
|
-
|
167
|
+
|
102
168
|
def fetch
|
103
|
-
session = Faraday.new(:
|
104
|
-
|
169
|
+
session = Faraday.new(url: url) do |faraday|
|
170
|
+
# Configure retries based on available middleware
|
171
|
+
faraday.request :retry, { max: @retries } if defined?(Faraday::Retry)
|
105
172
|
|
173
|
+
# Configure redirect handling
|
106
174
|
if @allow_redirections
|
107
|
-
|
108
|
-
|
175
|
+
begin
|
176
|
+
faraday.use FaradayMiddleware::FollowRedirects, limit: 10
|
177
|
+
faraday.use :cookie_jar
|
178
|
+
rescue NameError, NoMethodError
|
179
|
+
# Continue without middleware if not available
|
180
|
+
end
|
109
181
|
end
|
110
182
|
|
111
|
-
faraday.headers.merge!(@headers
|
183
|
+
faraday.headers.merge!(@headers)
|
112
184
|
faraday.adapter :net_http
|
113
185
|
end
|
114
186
|
|
115
|
-
|
116
|
-
|
117
|
-
req.options.open_timeout = @read_timeout
|
118
|
-
end
|
187
|
+
# Manual retry mechanism as a backup
|
188
|
+
retries = 0
|
119
189
|
|
120
|
-
|
190
|
+
begin
|
191
|
+
response = session.get do |req|
|
192
|
+
req.options.timeout = @timeout
|
193
|
+
req.options.open_timeout = @timeout
|
194
|
+
end
|
121
195
|
|
122
|
-
|
196
|
+
@url = response.env.url.to_s
|
197
|
+
response
|
198
|
+
rescue Faraday::TimeoutError, Faraday::ConnectionFailed => e
|
199
|
+
retries += 1
|
200
|
+
retry if retries <= @retries
|
201
|
+
raise e
|
202
|
+
end
|
123
203
|
end
|
124
204
|
|
125
205
|
def with_default_scheme(request)
|
126
|
-
request.url && request.scheme.nil? ?
|
127
|
-
end
|
128
|
-
|
129
|
-
def default_user_agent
|
130
|
-
"WebInspector/#{WebInspector::VERSION} (+https://github.com/davidesantangelo/webinspector)"
|
206
|
+
request.url && request.scheme.nil? ? "http://#{request.url}" : request.url
|
131
207
|
end
|
132
208
|
|
133
209
|
def page
|
134
|
-
|
210
|
+
# Use URI.open instead of open for Ruby 3.0+ compatibility
|
211
|
+
Nokogiri::HTML(URI.open(with_default_scheme(@request),
|
212
|
+
allow_redirections: :safe,
|
213
|
+
read_timeout: @timeout,
|
214
|
+
'User-Agent' => @headers['User-Agent']))
|
135
215
|
end
|
136
216
|
end
|
137
217
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'addressable/uri'
|
2
4
|
|
3
5
|
module WebInspector
|
@@ -13,7 +15,7 @@ module WebInspector
|
|
13
15
|
def host
|
14
16
|
uri.host
|
15
17
|
end
|
16
|
-
|
18
|
+
|
17
19
|
def domain
|
18
20
|
suffix_domain
|
19
21
|
end
|
@@ -24,23 +26,23 @@ module WebInspector
|
|
24
26
|
|
25
27
|
def port
|
26
28
|
URI(normalized_uri).port
|
27
|
-
end
|
29
|
+
end
|
28
30
|
|
29
31
|
private
|
30
|
-
|
32
|
+
|
31
33
|
def suffix_domain
|
32
34
|
return @domain if @domain
|
33
|
-
|
35
|
+
|
34
36
|
begin
|
35
37
|
@domain = PublicSuffix.parse(host).domain
|
36
|
-
rescue URI::InvalidURIError, PublicSuffix::DomainInvalid
|
38
|
+
rescue URI::InvalidURIError, PublicSuffix::DomainInvalid
|
37
39
|
@domain = ''
|
38
40
|
end
|
39
41
|
end
|
40
|
-
|
42
|
+
|
41
43
|
def uri
|
42
44
|
Addressable::URI.parse(@url)
|
43
|
-
rescue Addressable::URI::InvalidURIError
|
45
|
+
rescue Addressable::URI::InvalidURIError
|
44
46
|
nil
|
45
47
|
end
|
46
48
|
|
@@ -48,4 +50,4 @@ module WebInspector
|
|
48
50
|
uri.normalize.to_s
|
49
51
|
end
|
50
52
|
end
|
51
|
-
end
|
53
|
+
end
|
data/lib/web_inspector.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/page'))
|
2
4
|
require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/version'))
|
3
5
|
|
4
6
|
module WebInspector
|
5
|
-
|
7
|
+
module_function
|
6
8
|
|
7
9
|
def new(url, options = {})
|
8
10
|
Page.new(url, options)
|
9
11
|
end
|
10
|
-
end
|
12
|
+
end
|
data/lib/webinspector.rb
CHANGED
data/webinspector.gemspec
CHANGED
@@ -1,38 +1,45 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require File.expand_path('
|
5
|
+
require File.expand_path('lib/web_inspector/version', __dir__)
|
5
6
|
|
6
7
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
8
|
+
spec.name = 'webinspector'
|
8
9
|
spec.version = WebInspector::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
10
|
+
spec.authors = ['Davide Santangelo']
|
11
|
+
spec.email = ['davide.santangelo@gmail.com']
|
11
12
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
14
|
-
spec.homepage =
|
15
|
-
spec.license =
|
13
|
+
spec.summary = 'Ruby gem to inspect completely a web page.'
|
14
|
+
spec.description = 'Ruby gem to inspect completely a web page. It scrapes a given URL, and returns you its meta, links, images and more.'
|
15
|
+
spec.homepage = 'https://github.com/davidesantangelo/webinspector'
|
16
|
+
spec.license = 'MIT'
|
16
17
|
|
17
18
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
spec.bindir =
|
19
|
+
spec.bindir = 'exe'
|
19
20
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
-
spec.require_paths = [
|
21
|
-
|
22
|
-
|
23
|
-
|
21
|
+
spec.require_paths = ['lib']
|
22
|
+
spec.metadata = {
|
23
|
+
'source_code_uri' => 'https://github.com/davidesantangelo/webinspector',
|
24
|
+
'bug_tracker_uri' => 'https://github.com/davidesantangelo/webinspector/issues'
|
25
|
+
}
|
24
26
|
|
25
|
-
spec.
|
26
|
-
spec.add_development_dependency "vcr"
|
27
|
-
spec.add_development_dependency "typhoeus"
|
27
|
+
spec.required_ruby_version = '>= 3.0.0'
|
28
28
|
|
29
|
-
spec.
|
29
|
+
spec.add_development_dependency 'rake', '~> 13.0'
|
30
|
+
spec.add_development_dependency 'rspec', '~> 3.12'
|
31
|
+
spec.add_development_dependency 'rubocop', '~> 1.50'
|
32
|
+
spec.add_development_dependency 'vcr', '~> 6.1'
|
33
|
+
spec.add_development_dependency 'webmock', '~> 3.18'
|
30
34
|
|
31
|
-
spec.add_dependency
|
32
|
-
spec.add_dependency
|
33
|
-
spec.add_dependency
|
34
|
-
spec.add_dependency
|
35
|
-
spec.add_dependency
|
36
|
-
spec.add_dependency
|
37
|
-
spec.add_dependency
|
35
|
+
spec.add_dependency 'addressable', '~> 2.8'
|
36
|
+
spec.add_dependency 'faraday', '~> 2.7'
|
37
|
+
spec.add_dependency 'faraday-cookie_jar', '~> 0.0.7'
|
38
|
+
spec.add_dependency 'faraday-follow_redirects', '~> 0.3'
|
39
|
+
spec.add_dependency 'faraday-retry', '~> 2.1'
|
40
|
+
spec.add_dependency 'json', '~> 2.6'
|
41
|
+
spec.add_dependency 'nokogiri', '~> 1.14'
|
42
|
+
spec.add_dependency 'open_uri_redirections', '~> 0.2'
|
43
|
+
spec.add_dependency 'openurl', '~> 1.0'
|
44
|
+
spec.add_dependency 'public_suffix', '~> 5.0'
|
38
45
|
end
|