webinspector 0.5.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,18 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module WebInspector
2
4
  class Meta
3
- def initialize(page)
5
+ def initialize(page)
4
6
  @page = page
5
7
  end
6
8
 
7
9
  def meta_tags
8
10
  {
9
- 'name' => meta_tags_by('name'),
10
- 'http-equiv' => meta_tags_by('http-equiv'),
11
- 'property' => meta_tags_by('property'),
12
- 'charset' => [charset_from_meta_charset]
11
+ 'name' => meta_tags_by('name'),
12
+ 'http-equiv' => meta_tags_by('http-equiv'),
13
+ 'property' => meta_tags_by('property'),
14
+ 'charset' => [charset_from_meta_charset],
15
+ 'itemprop' => meta_tags_by('itemprop') # Add support for schema.org microdata
13
16
  }
14
17
  end
15
18
 
@@ -19,30 +22,48 @@ module WebInspector
19
22
 
20
23
  def meta
21
24
  meta_tag['name']
22
- .merge(meta_tag['http-equiv'])
23
- .merge(meta_tag['property'])
24
- .merge('charset' => meta_tag['charset'])
25
+ .merge(meta_tag['http-equiv'])
26
+ .merge(meta_tag['property'])
27
+ .merge(meta_tag['itemprop'] || {})
28
+ .merge('charset' => meta_tag['charset'])
25
29
  end
26
30
 
27
31
  def charset
28
- @charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
32
+ @charset ||= charset_from_meta_charset || charset_from_meta_content_type || charset_from_header || 'utf-8'
29
33
  end
30
34
 
31
35
  private
32
36
 
33
37
  def charset_from_meta_charset
34
- @page.css('meta[charset]')[0].attributes['charset'].value rescue nil
38
+ @page.css('meta[charset]')[0].attributes['charset'].value
39
+ rescue StandardError
40
+ nil
35
41
  end
36
42
 
37
43
  def charset_from_meta_content_type
38
- @page.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(';')[1].split('=')[1] rescue nil
44
+ @page.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(';')[1].strip.split('=')[1]
45
+ rescue StandardError
46
+ nil
47
+ end
48
+
49
+ def charset_from_header
50
+ # Try to get charset from Content-Type header if available
51
+ nil
39
52
  end
40
53
 
41
- def meta_tags_by(attribute)
54
+ def meta_tags_by(attribute)
42
55
  hash = {}
43
56
  @page.css("meta[@#{attribute}]").map do |tag|
44
- name = tag.attributes[attribute].value.downcase rescue nil
45
- content = tag.attributes['content'].value rescue nil
57
+ name = begin
58
+ tag.attributes[attribute].value.downcase
59
+ rescue StandardError
60
+ nil
61
+ end
62
+ content = begin
63
+ tag.attributes['content'].value
64
+ rescue StandardError
65
+ nil
66
+ end
46
67
 
47
68
  if name && content
48
69
  hash[name] ||= []
@@ -64,4 +85,4 @@ module WebInspector
64
85
  end
65
86
  end
66
87
  end
67
- end
88
+ end
data/lib/web_inspector/page.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  require 'uri'
3
5
  require 'open-uri'
@@ -5,133 +7,316 @@ require 'open_uri_redirections'
5
7
  require 'faraday'
6
8
  require 'public_suffix'
7
9
 
10
+ # Explicitly load Faraday::Retry if available
11
+ begin
12
+ require 'faraday/retry'
13
+ rescue LoadError
14
+ # Faraday retry is not available
15
+ end
16
+
8
17
  require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
9
18
  require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
10
19
 
11
20
  module WebInspector
12
21
  class Page
13
- attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :domain_links, :domain_images, :images, :response
22
+ attr_reader :status_code
23
+
24
+ DEFAULT_TIMEOUT = 30
25
+ DEFAULT_RETRIES = 3
26
+ DEFAULT_USER_AGENT = -> { "WebInspector/#{WebInspector::VERSION} (+https://github.com/davidesantangelo/webinspector)" }
14
27
 
28
+ # Initialize a new WebInspector Page
29
+ #
30
+ # @param url [String] The URL to inspect
31
+ # @param options [Hash] Optional parameters
32
+ # @option options [Integer] :timeout Request timeout in seconds
33
+ # @option options [Integer] :retries Number of retries for failed requests
34
+ # @option options [Hash] :headers Custom HTTP headers
35
+ # @option options [Boolean] :allow_redirections Whether to follow redirects
36
+ # @option options [String] :user_agent Custom user agent
15
37
  def initialize(url, options = {})
16
38
  @url = url
17
39
  @options = options
40
+ @retries = options[:retries] || DEFAULT_RETRIES
41
+ @timeout = options[:timeout] || DEFAULT_TIMEOUT
42
+ @headers = options[:headers] || { 'User-Agent' => options[:user_agent] || DEFAULT_USER_AGENT.call }
43
+ @allow_redirections = options[:allow_redirections].nil? || options[:allow_redirections]
44
+
18
45
  @request = WebInspector::Request.new(url)
19
- @inspector = WebInspector::Inspector.new(page)
20
- end
21
46
 
22
- def title
23
- @inspector.title
47
+ begin
48
+ @inspector = WebInspector::Inspector.new(page)
49
+ @inspector.set_url(url, host)
50
+ @status_code = 200
51
+ rescue StandardError => e
52
+ @error = e
53
+ @status_code = e.respond_to?(:status_code) ? e.status_code : 500
54
+ end
24
55
  end
25
56
 
26
- def description
27
- @inspector.description
57
+ # Check if the page was successfully loaded
58
+ #
59
+ # @return [Boolean] true if the page was loaded, false otherwise
60
+ def success?
61
+ !@inspector.nil? && !@error
28
62
  end
29
63
 
30
- def body
31
- @inspector.body
32
- end
33
-
34
- def links
35
- @inspector.links
64
+ # Get the error message if any
65
+ #
66
+ # @return [String, nil] The error message or nil if no error
67
+ def error_message
68
+ @error&.message
36
69
  end
37
70
 
38
- def images
39
- @inspector.images
40
- end
71
+ # Delegate methods to inspector
72
+ %i[title description body links images meta javascripts stylesheets language structured_data microdata
73
+ tag_count].each do |method|
74
+ define_method(method) do
75
+ return nil unless success?
41
76
 
42
- def meta
43
- @inspector.meta
77
+ @inspector.send(method)
78
+ end
44
79
  end
45
80
 
81
+ # Special case for find method that takes arguments
46
82
  def find(words)
83
+ return nil unless success?
84
+
47
85
  @inspector.find(words)
48
86
  end
49
87
 
50
- def url
51
- @request.url
88
+ # Delegate methods to request
89
+ %i[url host domain scheme port].each do |method|
90
+ define_method(method) do
91
+ @request.send(method)
92
+ end
52
93
  end
53
94
 
54
- def host
55
- @request.host
56
- end
95
+ # Get the favicon URL if available
96
+ #
97
+ # @return [String, nil] The favicon URL or nil if not found
98
+ def favicon
99
+ return @favicon if defined?(@favicon)
57
100
 
58
- def domain
59
- @request.domain
60
- end
101
+ return nil unless success?
61
102
 
62
- def scheme
63
- @request.scheme
64
- end
103
+ @favicon = begin
104
+ # Try multiple approaches to find favicon
105
+
106
+ # 1. Look for standard favicon link tags
107
+ favicon_link = @inspector.page.css("link[rel='shortcut icon'], link[rel='icon'], link[rel='apple-touch-icon']").first
108
+ if favicon_link && favicon_link['href']
109
+ begin
110
+ return URI.join(url, favicon_link['href']).to_s
111
+ rescue URI::InvalidURIError
112
+ # Try next method
113
+ end
114
+ end
65
115
 
66
- def port
67
- @request.port
116
+ # 2. Try the default location /favicon.ico
117
+ "#{scheme}://#{host}/favicon.ico"
118
+ rescue StandardError
119
+ nil
120
+ end
68
121
  end
69
122
 
70
123
  def domain_links(u = domain)
124
+ return [] unless success?
125
+
71
126
  @inspector.domain_links(u, host)
72
127
  end
73
128
 
74
129
  def domain_images(u = domain)
130
+ return [] unless success?
131
+
75
132
  @inspector.domain_images(u, host)
76
133
  end
77
134
 
135
+ # Get information about the page's security
136
+ # @return [Hash] Security information
137
+ def security_info
138
+ return @security_info if defined?(@security_info)
139
+
140
+ @security_info = {
141
+ secure: scheme == 'https',
142
+ hsts: response&.headers && response.headers['strict-transport-security'] ? true : false,
143
+ content_security_policy: response&.headers && response.headers['content-security-policy'] ? true : false
144
+ }
145
+
146
+ # Extract SSL/TLS info if available and using HTTPS
147
+ if scheme == 'https' && response&.env&.response_headers
148
+ @security_info[:ssl_version] = response.env[:ssl_version]
149
+ @security_info[:cipher_suite] = response.env[:cipher_suite]
150
+ end
151
+
152
+ @security_info
153
+ end
154
+
155
+ # Get the content type of the page
156
+ # @return [String, nil] Content type
157
+ def content_type
158
+ response&.headers && response.headers['content-type']
159
+ end
160
+
161
+ # Get the size of the page in bytes
162
+ # @return [Integer, nil] Size in bytes
163
+ def size
164
+ return @size if defined?(@size)
165
+
166
+ @size = if response&.headers && response.headers['content-length']
167
+ response.headers['content-length'].to_i
168
+ elsif response&.body
169
+ response.body.bytesize
170
+ end
171
+ end
172
+
173
+ # Get the load time of the page in seconds
174
+ # @return [Float, nil] Load time in seconds
175
+ attr_reader :load_time
176
+
177
+ # Get all JSON-LD structured data as a hash
178
+ # @return [Array<Hash>] Structured data
179
+ def json_ld
180
+ structured_data
181
+ end
182
+
183
+ # Get a hash of all technologies detected on the page
184
+ # @return [Hash] Detected technologies
185
+ def technologies
186
+ techs = {}
187
+ js_files = javascripts || []
188
+ css_files = stylesheets || []
189
+ page_body = body || ''
190
+ page_meta = meta || {}
191
+ response_headers = response&.headers || {}
192
+
193
+ # Frameworks and Libraries
194
+ techs[:jquery] = true if js_files.any? { |js| js.include?('jquery') } || page_body.include?('jQuery')
195
+ techs[:react] = true if page_body.include?('data-reactroot') || js_files.any? { |js| js.include?('react') }
196
+ techs[:vue] = true if page_body.include?('data-v-app') || js_files.any? { |js| js.include?('vue') }
197
+ techs[:angular] = true if page_body.include?('ng-version') || js_files.any? { |js| js.include?('angular') }
198
+ techs[:bootstrap] = true if css_files.any? do |css|
199
+ css.include?('bootstrap')
200
+ end || page_body.include?('class="container"')
201
+ if response_headers['x-powered-by']&.include?('Rails') || response_headers.key?('x-rails-env')
202
+ techs[:rails] =
203
+ true
204
+ end
205
+ techs[:php] = true if response_headers['x-powered-by']&.include?('PHP')
206
+
207
+ # CMS
208
+ techs[:wordpress] = true if page_meta['generator']&.include?('WordPress') || page_body.include?('/wp-content/')
209
+ techs[:shopify] = true if page_body.include?('Shopify.shop')
210
+
211
+ # Analytics
212
+ techs[:google_analytics] = true if js_files.any? { |js| js.include?('google-analytics.com') }
213
+
214
+ # Server
215
+ server = response_headers['server']
216
+ if server
217
+ techs[:server] = server
218
+ techs[:nginx] = true if server.include?('nginx')
219
+ techs[:apache] = true if server.include?('Apache')
220
+ techs[:iis] = true if server.include?('IIS')
221
+ techs[:express] = true if response_headers['x-powered-by']&.include?('Express')
222
+ end
223
+
224
+ techs
225
+ end
226
+
227
+ # Get full JSON representation of the page with all new data
228
+ # @return [Hash] JSON representation of the page
78
229
  def to_hash
79
230
  {
80
- 'url' => url,
81
- 'scheme' => scheme,
82
- 'host' => host,
83
- 'port' => port,
84
- 'title' => title,
85
- 'description' => description,
86
- 'meta' => meta,
87
- 'links' => links,
88
- 'images' => images,
89
- 'response' => { 'status' => response.status,
90
- 'headers' => response.headers }
231
+ 'url' => url,
232
+ 'scheme' => scheme,
233
+ 'host' => host,
234
+ 'port' => port,
235
+ 'title' => title,
236
+ 'description' => description,
237
+ 'meta' => meta,
238
+ 'links' => links,
239
+ 'images' => images,
240
+ 'javascripts' => javascripts,
241
+ 'stylesheets' => stylesheets,
242
+ 'favicon' => favicon,
243
+ 'language' => language,
244
+ 'structured_data' => structured_data,
245
+ 'microdata' => microdata,
246
+ 'security_info' => security_info,
247
+ 'content_type' => content_type,
248
+ 'size' => size,
249
+ 'load_time' => load_time,
250
+ 'technologies' => technologies,
251
+ 'tag_count' => tag_count,
252
+ 'response' => {
253
+ 'status' => status_code,
254
+ 'headers' => response&.headers || {},
255
+ 'success' => success?
256
+ },
257
+ 'error' => error_message
91
258
  }
92
259
  end
93
260
 
94
261
  def response
95
262
  @response ||= fetch
96
- rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed, RuntimeError, URI::InvalidURIError => e
263
+ rescue StandardError => e
264
+ @error = e
97
265
  nil
98
266
  end
99
267
 
100
268
  private
101
-
269
+
102
270
  def fetch
103
- session = Faraday.new(:url => url) do |faraday|
104
- faraday.request :retry, max: @retries
271
+ start_time = Time.now
105
272
 
273
+ session = Faraday.new(url: url) do |faraday|
274
+ # Configure retries based on available middleware
275
+ faraday.request :retry, { max: @retries } if defined?(Faraday::Retry)
276
+
277
+ # Configure redirect handling
106
278
  if @allow_redirections
107
- faraday.use FaradayMiddleware::FollowRedirects, limit: 10
108
- faraday.use :cookie_jar
279
+ begin
280
+ faraday.use FaradayMiddleware::FollowRedirects, limit: 10
281
+ faraday.use :cookie_jar
282
+ rescue NameError, NoMethodError
283
+ # Continue without middleware if not available
284
+ end
109
285
  end
110
286
 
111
- faraday.headers.merge!(@headers || {})
287
+ faraday.headers.merge!(@headers)
112
288
  faraday.adapter :net_http
113
289
  end
114
290
 
115
- response = session.get do |req|
116
- req.options.timeout = @connection_timeout
117
- req.options.open_timeout = @read_timeout
118
- end
291
+ # Manual retry mechanism as a backup
292
+ retries = 0
119
293
 
120
- @url = response.env.url.to_s
294
+ begin
295
+ response = session.get do |req|
296
+ req.options.timeout = @timeout
297
+ req.options.open_timeout = @timeout
298
+ end
121
299
 
122
- response
300
+ @url = response.env.url.to_s
301
+ @load_time = Time.now - start_time
302
+ response
303
+ rescue Faraday::TimeoutError, Faraday::ConnectionFailed => e
304
+ retries += 1
305
+ retry if retries <= @retries
306
+ raise e
307
+ end
123
308
  end
124
309
 
125
310
  def with_default_scheme(request)
126
- request.url && request.scheme.nil? ? 'http://' + request.url : request.url
127
- end
128
-
129
- def default_user_agent
130
- "WebInspector/#{WebInspector::VERSION} (+https://github.com/davidesantangelo/webinspector)"
311
+ request.url && request.scheme.nil? ? "http://#{request.url}" : request.url
131
312
  end
132
313
 
133
314
  def page
134
- Nokogiri::HTML(open(with_default_scheme(@request), allow_redirections: :safe))
315
+ # Use URI.open instead of open for Ruby 3.0+ compatibility
316
+ Nokogiri::HTML(URI.open(with_default_scheme(@request),
317
+ allow_redirections: :safe,
318
+ read_timeout: @timeout,
319
+ 'User-Agent' => @headers['User-Agent']))
135
320
  end
136
321
  end
137
322
  end
data/lib/web_inspector/request.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'addressable/uri'
2
4
 
3
5
  module WebInspector
@@ -13,7 +15,7 @@ module WebInspector
13
15
  def host
14
16
  uri.host
15
17
  end
16
-
18
+
17
19
  def domain
18
20
  suffix_domain
19
21
  end
@@ -24,23 +26,23 @@ module WebInspector
24
26
 
25
27
  def port
26
28
  URI(normalized_uri).port
27
- end
29
+ end
28
30
 
29
31
  private
30
-
32
+
31
33
  def suffix_domain
32
34
  return @domain if @domain
33
-
35
+
34
36
  begin
35
37
  @domain = PublicSuffix.parse(host).domain
36
- rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
38
+ rescue URI::InvalidURIError, PublicSuffix::DomainInvalid
37
39
  @domain = ''
38
40
  end
39
41
  end
40
-
42
+
41
43
  def uri
42
44
  Addressable::URI.parse(@url)
43
- rescue Addressable::URI::InvalidURIError => e
45
+ rescue Addressable::URI::InvalidURIError
44
46
  nil
45
47
  end
46
48
 
@@ -48,4 +50,4 @@ module WebInspector
48
50
  uri.normalize.to_s
49
51
  end
50
52
  end
51
- end
53
+ end
data/lib/web_inspector/version.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module WebInspector
2
- VERSION = "0.5.0"
4
+ VERSION = '1.1.0'
3
5
  end
data/lib/web_inspector.rb CHANGED
@@ -1,10 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/page'))
2
4
  require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/version'))
3
5
 
4
6
  module WebInspector
5
- extend self
7
+ module_function
6
8
 
7
9
  def new(url, options = {})
8
10
  Page.new(url, options)
9
11
  end
10
- end
12
+ end
data/lib/webinspector.rb CHANGED
@@ -1 +1,3 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), './web_inspector'))
1
+ # frozen_string_literal: true
2
+
3
+ require File.expand_path(File.join(File.dirname(__FILE__), './web_inspector'))
data/webinspector.gemspec CHANGED
@@ -1,38 +1,45 @@
1
- # coding: utf-8
2
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path('lib', __dir__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require File.expand_path('../lib/web_inspector/version', __FILE__)
5
+ require File.expand_path('lib/web_inspector/version', __dir__)
5
6
 
6
7
  Gem::Specification.new do |spec|
7
- spec.name = "webinspector"
8
+ spec.name = 'webinspector'
8
9
  spec.version = WebInspector::VERSION
9
- spec.authors = ["Davide Santangelo"]
10
- spec.email = ["davide.santangelo@gmail.com"]
10
+ spec.authors = ['Davide Santangelo']
11
+ spec.email = ['davide.santangelo@gmail.com']
11
12
 
12
- spec.summary = %q{Ruby gem to inspect completely a web page.}
13
- spec.description = %q{Ruby gem to inspect completely a web page. It scrapes a given URL, and returns you its meta, links, images and more.}
14
- spec.homepage = ""
15
- spec.license = "MIT"
13
+ spec.summary = 'Ruby gem to inspect completely a web page.'
14
+ spec.description = 'Ruby gem to inspect completely a web page. It scrapes a given URL, and returns you its meta, links, images and more.'
15
+ spec.homepage = 'https://github.com/davidesantangelo/webinspector'
16
+ spec.license = 'MIT'
16
17
 
17
18
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
- spec.bindir = "exe"
19
+ spec.bindir = 'exe'
19
20
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
- spec.require_paths = ["lib"]
21
-
22
- spec.add_development_dependency "bundler", "~> 1.8"
23
- spec.add_development_dependency "rake", "~> 10.0"
21
+ spec.require_paths = ['lib']
22
+ spec.metadata = {
23
+ 'source_code_uri' => 'https://github.com/davidesantangelo/webinspector',
24
+ 'bug_tracker_uri' => 'https://github.com/davidesantangelo/webinspector/issues'
25
+ }
24
26
 
25
- spec.add_development_dependency 'rspec'
26
- spec.add_development_dependency "vcr"
27
- spec.add_development_dependency "typhoeus"
27
+ spec.required_ruby_version = '>= 3.0.0'
28
28
 
29
- spec.required_ruby_version = ">= 1.9.3"
29
+ spec.add_development_dependency 'rake', '~> 13.0'
30
+ spec.add_development_dependency 'rspec', '~> 3.12'
31
+ spec.add_development_dependency 'rubocop', '~> 1.50'
32
+ spec.add_development_dependency 'vcr', '~> 6.1'
33
+ spec.add_development_dependency 'webmock', '~> 3.18'
30
34
 
31
- spec.add_dependency "faraday"
32
- spec.add_dependency "json"
33
- spec.add_dependency "addressable"
34
- spec.add_dependency "nokogiri"
35
- spec.add_dependency "open_uri_redirections"
36
- spec.add_dependency "openurl"
37
- spec.add_dependency "public_suffix"
35
+ spec.add_dependency 'addressable', '~> 2.8'
36
+ spec.add_dependency 'faraday', '~> 2.7'
37
+ spec.add_dependency 'faraday-cookie_jar', '~> 0.0.7'
38
+ spec.add_dependency 'faraday-follow_redirects', '~> 0.3'
39
+ spec.add_dependency 'faraday-retry', '~> 2.1'
40
+ spec.add_dependency 'json', '~> 2.6'
41
+ spec.add_dependency 'nokogiri', '~> 1.14'
42
+ spec.add_dependency 'open_uri_redirections', '~> 0.2'
43
+ spec.add_dependency 'openurl', '~> 1.0'
44
+ spec.add_dependency 'public_suffix', '~> 5.0'
38
45
  end