webinspector 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0413d3ff948ab6efff6a1cbe8a7844287149ad06f09353655e6cb208968f9481
4
- data.tar.gz: 152b950595afb57adc522da24c6959f71d160ba903b3d01ce6ee5f6a8b4d81d2
3
+ metadata.gz: 7ef69ac6245db062331d4f0b269d1f389cb9e730e4ad39e51d75236f30e0b6c9
4
+ data.tar.gz: b37d5b30a01ea6b0e74553edb9944acdeac4d5ce2808e319798b8d0d3a9f5390
5
5
  SHA512:
6
- metadata.gz: c6230493b59a0d23585be729ec98706cfdbd6852e2de2d65db83d1638f85110369d41f2275b8e0aa09b58008d53924d036840ce63523d9004f19275999be90f8
7
- data.tar.gz: dad518b0b04c1e341c14c29438ebcf84f4602bf394254a4c61382ad79ce53dae2ac92f138f331d5bf089d2220011f14b0cde0a79608a91177b2bda2ab1773a96
6
+ metadata.gz: '098f7c7f19f8bfe8b8f3b4977cac590e9c107cb8864128e4abbb3b47327e34743844fcde2d5bf6e3038e39e7271d565dbd19157b703ea041f31e4c3fbc84aa47'
7
+ data.tar.gz: 5fdeaa74861245796f15bf70ba5fcc00e7e7eb2f0c1192a60395a119b24ea1c72b579a24296cb66c1fd77cc7c4cfff8b2505d321a2ef74dd7b5618b84e69ae5c
data/README.md CHANGED
@@ -4,7 +4,6 @@ Ruby gem to inspect web pages. It scrapes a given URL and returns its title, des
4
4
 
5
5
  <a href="https://codeclimate.com/github/davidesantangelo/webinspector"><img src="https://codeclimate.com/github/davidesantangelo/webinspector/badges/gpa.svg" /></a>
6
6
 
7
-
8
7
  ## Installation
9
8
 
10
9
  Add this line to your application's Gemfile:
@@ -34,7 +33,7 @@ page = WebInspector.new('http://example.com')
34
33
  ```ruby
35
34
  page = WebInspector.new('http://example.com', {
36
35
  timeout: 30, # Request timeout in seconds (default: 30)
37
- retries: 3, # Number of retries (default: 3)
36
+ retries: 3, # Number of retries (default: 3)
38
37
  headers: {'User-Agent': 'Custom UA'} # Custom HTTP headers
39
38
  })
40
39
  ```
@@ -86,16 +85,125 @@ page.domain_images('example.com') # returns only images hosted on example.com
86
85
  page.find(["ruby", "rails"]) # returns [{"ruby"=>3}, {"rails"=>1}]
87
86
  ```
88
87
 
88
+ #### JavaScript and Stylesheets
89
+
90
+ ```ruby
91
+ page.javascripts # array of all JavaScript files (absolute URLs)
92
+ page.stylesheets # array of all CSS stylesheets (absolute URLs)
93
+ ```
94
+
95
+ #### Language Detection
96
+
97
+ ```ruby
98
+ page.language # detected language code (e.g., "en", "es", "fr")
99
+ ```
100
+
101
+ #### Structured Data
102
+
103
+ ```ruby
104
+ page.structured_data # array of JSON-LD structured data objects
105
+ page.microdata # array of microdata items
106
+ page.json_ld # alias for structured_data
107
+ ```
108
+
109
+ #### Security Information
110
+
111
+ ```ruby
112
+ page.security_info # hash with security details: { secure: true, hsts: true, ... }
113
+ ```
114
+
115
+ #### Performance Metrics
116
+
117
+ ```ruby
118
+ page.load_time # page load time in seconds
119
+ page.size # page size in bytes
120
+ ```
121
+
122
+ #### Content Type
123
+
124
+ ```ruby
125
+ page.content_type # content type header (e.g., "text/html; charset=utf-8")
126
+ ```
127
+
128
+ #### Technology Detection
129
+
130
+ ```ruby
131
+ page.technologies # hash of detected technologies: { jquery: true, react: true, ... }
132
+ ```
133
+
134
+ #### HTML Tag Statistics
135
+
136
+ ```ruby
137
+ page.tag_count # hash with counts of each HTML tag: { "div" => 45, "p" => 12, ... }
138
+ ```
139
+
140
+ #### RSS/Atom Feeds
141
+
142
+ ```ruby
143
+ page.feeds # array of RSS/Atom feed URLs found on the page
144
+ ```
145
+
146
+ #### Social Media Links
147
+
148
+ ```ruby
149
+ page.social_links # hash of social media profiles: { facebook: "url", twitter: "url", ... }
150
+ ```
151
+
152
+ #### Robots.txt and Sitemap
153
+
154
+ ```ruby
155
+ page.robots_txt_url # URL to robots.txt
156
+ page.sitemap_url # array of sitemap URLs
157
+ ```
158
+
159
+ #### CMS Detection
160
+
161
+ ```ruby
162
+ page.cms_info # hash with CMS details: { name: "WordPress", version: "6.0", themes: [...], plugins: [...] }
163
+ ```
164
+
165
+ #### Accessibility Score
166
+
167
+ ```ruby
168
+ page.accessibility_score # hash with score (0-100) and details: { score: 85, details: [...] }
169
+ ```
170
+
171
+ #### Mobile-Friendly Check
172
+
173
+ ```ruby
174
+ page.mobile_friendly? # true if the page has viewport meta tag and responsive CSS
175
+ ```
176
+
89
177
  ### Export all data to JSON
90
178
 
91
179
  ```ruby
92
180
  page.to_hash # returns a hash with all page data
93
181
  ```
94
182
 
183
+ ## Changelog
184
+
185
+ ### Version 1.2.0
186
+
187
+ **New Features:**
188
+
189
+ - RSS/Atom feed detection with `feeds` method
190
+ - Social media profile extraction with `social_links` method
191
+ - CMS detection and information with `cms_info` method (WordPress, Drupal, Joomla, Shopify, Wix, Squarespace)
192
+ - Accessibility scoring with `accessibility_score` method
193
+ - Mobile-friendly detection with `mobile_friendly?` method
194
+ - Robots.txt and sitemap URL detection with `robots_txt_url` and `sitemap_url` methods
195
+
196
+ **Improvements:**
197
+
198
+ - Enhanced `Request` module with `valid?` and `ssl?` methods for better URL validation
199
+ - Improved `Meta` module with author and publisher extraction
200
+ - Better error handling across all modules
201
+ - Performance improvements with internal caching
202
+
95
203
  ## Contributors
96
204
 
97
- * Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
98
- * Sam Nissen ([@samnissen](https://github.com/samnissen))
205
+ - Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
206
+ - Sam Nissen ([@samnissen](https://github.com/samnissen))
99
207
 
100
208
  ## License
101
209
 
@@ -71,26 +71,7 @@ module WebInspector
71
71
  # @return [Array<String>] Filtered links
72
72
  def domain_links(user_domain, host = nil)
73
73
  @host ||= host
74
-
75
- return [] if links.empty?
76
-
77
- # Handle nil user_domain
78
- user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
79
-
80
- # Normalize domain for comparison
81
- user_domain = user_domain.to_s.downcase.gsub(/\s+/, '')
82
- user_domain = user_domain.sub(/^www\./, '') # Remove www prefix for comparison
83
-
84
- links.select do |link|
85
- uri = URI.parse(link.to_s)
86
- next false unless uri.host # Skip URLs without hosts
87
-
88
- uri_host = uri.host.to_s.downcase
89
- uri_host = uri_host.sub(/^www\./, '') # Remove www prefix for comparison
90
- uri_host.include?(user_domain)
91
- rescue URI::InvalidURIError, NoMethodError
92
- false
93
- end
74
+ filter_by_domain(links, user_domain)
94
75
  end
95
76
 
96
77
  # Get all images from the page
@@ -122,25 +103,325 @@ module WebInspector
122
103
  # @return [Array<String>] Filtered images
123
104
  def domain_images(user_domain, host = nil)
124
105
  @host ||= host
106
+ filter_by_domain(images, user_domain)
107
+ end
125
108
 
126
- return [] if images.empty?
109
+ # Get all JavaScript files used by the page
110
+ # @return [Array<String>] Array of JavaScript file URLs
111
+ def javascripts
112
+ @javascripts ||= begin
113
+ scripts = []
114
+ @page.css('script[src]').each do |script|
115
+ src = script[:src]
116
+ next unless src
127
117
 
128
- # Handle nil user_domain
129
- user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
118
+ # Clean and normalize URL
119
+ src = src.strip
130
120
 
131
- # Normalize domain for comparison
132
- user_domain = user_domain.to_s.downcase.gsub(/\s+/, '')
133
- user_domain = user_domain.sub(/^www\./, '') # Remove www prefix for comparison
121
+ begin
122
+ absolute_url = make_absolute_url(src)
123
+ scripts << absolute_url if absolute_url
124
+ rescue URI::InvalidURIError, URI::BadURIError
125
+ # Skip invalid URLs
126
+ end
127
+ end
128
+ scripts.uniq.compact
129
+ end
130
+ end
134
131
 
135
- images.select do |img|
136
- uri = URI.parse(img.to_s)
137
- next false unless uri.host # Skip URLs without hosts
132
+ # Get stylesheets used by the page
133
+ # @return [Array<String>] Array of CSS file URLs
134
+ def stylesheets
135
+ @stylesheets ||= begin
136
+ styles = []
137
+ @page.css('link[rel="stylesheet"]').each do |style|
138
+ href = style[:href]
139
+ next unless href
138
140
 
139
- uri_host = uri.host.to_s.downcase
140
- uri_host = uri_host.sub(/^www\./, '') # Remove www prefix for comparison
141
- uri_host.include?(user_domain)
142
- rescue URI::InvalidURIError, NoMethodError
143
- false
141
+ # Clean and normalize URL
142
+ href = href.strip
143
+
144
+ begin
145
+ absolute_url = make_absolute_url(href)
146
+ styles << absolute_url if absolute_url
147
+ rescue URI::InvalidURIError, URI::BadURIError
148
+ # Skip invalid URLs
149
+ end
150
+ end
151
+ styles.uniq.compact
152
+ end
153
+ end
154
+
155
# Detect the page language.
#
# The `lang` attribute of the <html> element is checked first, then the
# 'content-language' entry of @meta. Surrounding whitespace is stripped and
# blank values are treated as missing.
#
# @return [String, nil] Language code if detected, nil otherwise
def language
  html_tag = @page.at('html')
  candidates = [html_tag && html_tag['lang'], @meta && @meta['content-language']]

  candidates.each do |candidate|
    code = candidate.to_s.strip
    return code unless code.empty?
  end

  # No language information found in the document
  nil
end
169
+
170
# Extract structured data (JSON-LD) from the page.
#
# Every <script type="application/ld+json"> element is parsed. Scripts with
# invalid JSON or a JSON `null` body are skipped. A script whose top-level
# value is an array contributes each of its elements, so the result stays a
# flat list of objects.
#
# The result is memoized in @structured_data.
#
# @return [Array<Hash>] Array of structured data objects
def structured_data
  @structured_data ||= begin
    scripts = @page.css('script[type="application/ld+json"]')
    scripts.each_with_object([]) do |script, data|
      begin
        parsed = JSON.parse(script.text)
      rescue JSON::ParserError
        next # Skip invalid JSON
      end
      next if parsed.nil?

      if parsed.is_a?(Array)
        data.concat(parsed.compact)
      else
        data << parsed
      end
    end
  end
end
184
+
185
# Extract microdata from the page.
#
# Each element carrying `itemscope` becomes one item of the form
# `{ type: <itemtype attribute>, properties: { <itemprop> => <value> } }`.
# When the same itemprop name occurs more than once inside a scope, the last
# occurrence wins. The result is memoized in @microdata.
#
# @return [Array<Hash>] Array of microdata items
def microdata
  @microdata ||= @page.css('[itemscope]').map do |scope|
    properties = scope.css('[itemprop]').each_with_object({}) do |prop, collected|
      collected[prop['itemprop']] = microdata_value(prop)
    end

    { type: scope['itemtype'], properties: properties }
  end
end

# Pick the value of a single itemprop element based on its tag name.
#
# URL-bearing attributes (src, href, data) are passed through
# make_absolute_url; <meta> uses `content`; <data>/<meter> use `value`;
# <time> prefers `datetime`; everything else uses the stripped text.
#
# @param prop [#name, #[], #text] Element carrying the itemprop attribute
# @return [String, nil] Extracted property value
def microdata_value(prop)
  case prop.name.downcase
  when 'meta'
    prop['content']
  when 'img', 'audio', 'embed', 'iframe', 'source', 'track', 'video'
    make_absolute_url(prop['src'])
  when 'a', 'area', 'link'
    make_absolute_url(prop['href'])
  when 'object'
    make_absolute_url(prop['data'])
  when 'data', 'meter'
    prop['value'] || prop.text.strip
  when 'time'
    prop['datetime'] || prop.text.strip
  else
    prop.text.strip
  end
end
private :microdata_value
218
+
219
# Count all tag types on the page.
#
# Tag names are lower-cased before counting. The result is memoized in
# @tag_count, so the document is only walked once; callers receive the same
# Hash instance on every call and should not mutate it.
#
# @return [Hash{String => Integer}] Counts of different HTML elements
def tag_count
  @tag_count ||= @page.css('*').each_with_object({}) do |element, counts|
    tag_name = element.name.downcase
    counts[tag_name] = counts.fetch(tag_name, 0) + 1
  end
end
230
+
231
# Extract RSS/Atom feeds from the page.
#
# Two sources are combined, in this order:
# 1. <link> elements typed application/rss+xml or application/atom+xml
#    (their href is passed through make_absolute_url);
# 2. page links whose path contains a /feed, /rss or /atom segment that is
#    followed by "/", ".xml", a query string, a fragment, or the end of the URL.
#
# Duplicates and nils are removed. The result is memoized in @feeds.
#
# @return [Array<String>] Array of feed URLs
def feeds
  @feeds ||= begin
    declared = @page.css('link[type="application/rss+xml"], link[type="application/atom+xml"]').map do |link|
      href = link[:href]
      make_absolute_url(href) if href
    end

    # to_s guards against non-string entries in links
    guessed = links.select do |link|
      link.to_s.match?(%r{/(feed|rss|atom)(/|\.xml|[?#]|\z)}i)
    end

    (declared + guessed).compact.uniq
  end
end
251
+
252
# Extract social media profile links.
#
# A link is attributed to a platform only when its *host* is the platform's
# domain or a subdomain of it. Matching on the host (instead of anywhere in
# the URL) keeps e.g. dropbox.com from being reported as x.com, and ignores
# platform names that merely appear in a path or query string.
# The first matching link per platform wins. Memoized in @social_links.
#
# @return [Hash{Symbol => String}] Hash of social platform => URL
def social_links
  @social_links ||= begin
    platforms = {
      facebook: %w[facebook.com],
      twitter: %w[twitter.com x.com],
      linkedin: %w[linkedin.com],
      instagram: %w[instagram.com],
      youtube: %w[youtube.com],
      github: %w[github.com],
      tiktok: %w[tiktok.com]
    }

    links.each_with_object({}) do |link, socials|
      host = social_link_host(link)
      next unless host

      platforms.each do |platform, domains|
        next if socials.key?(platform)

        socials[platform] = link if domains.any? { |domain| host == domain || host.end_with?(".#{domain}") }
      end
    end
  end
end

# Lower-cased host of a link, or nil when the link has no host or cannot be
# parsed as a URI.
#
# @param link [#to_s] Candidate URL
# @return [String, nil]
def social_link_host(link)
  host = URI.parse(link.to_s.strip).host
  host && host.downcase
rescue URI::InvalidURIError
  nil
end
private :social_link_host
277
+
278
# Get robots.txt URL.
#
# The URL is built from the scheme, host and (non-default) port of @url, so
# a query string or fragment on a path-less URL such as
# "http://example.com?x=1" does not leak into the result. When @url cannot
# be parsed into a scheme and host, the first three "/"-separated segments
# are used as the base instead.
#
# @return [String, nil] robots.txt URL, or nil when @url is not set
def robots_txt_url
  return unless @url

  uri = URI.parse(@url.to_s)
  return "#{@url.split('/')[0..2].join('/')}/robots.txt" unless uri.scheme && uri.host

  port = uri.port && uri.port != uri.default_port ? ":#{uri.port}" : ''
  "#{uri.scheme}://#{uri.host}#{port}/robots.txt"
rescue URI::InvalidURIError
  "#{@url.split('/')[0..2].join('/')}/robots.txt"
end
283
+
284
# Get sitemap URLs.
#
# Combines every <link rel="sitemap"> href (made absolute through
# make_absolute_url) with the conventional "<origin>/sitemap.xml" location
# derived from @url. Duplicates and nils are removed. Memoized in
# @sitemap_url.
#
# @return [Array<String>] Array of sitemap URLs
def sitemap_url
  @sitemap_url ||= begin
    declared = @page.css('link[rel="sitemap"]').map do |link|
      href = link[:href]
      make_absolute_url(href) if href
    end

    (declared << default_sitemap_location).compact.uniq
  end
end

# Conventional sitemap location for the origin of @url.
#
# Built from scheme, host and non-default port so that a query string or
# fragment on a path-less URL is not carried over. When @url cannot be parsed
# into a scheme and host, the first three "/"-separated segments are used as
# the base instead.
#
# @return [String, nil] nil when @url is not set
def default_sitemap_location
  return unless @url

  uri = URI.parse(@url.to_s)
  return "#{@url.split('/')[0..2].join('/')}/sitemap.xml" unless uri.scheme && uri.host

  port = uri.port && uri.port != uri.default_port ? ":#{uri.port}" : ''
  "#{uri.scheme}://#{uri.host}#{port}/sitemap.xml"
rescue URI::InvalidURIError
  "#{@url.split('/')[0..2].join('/')}/sitemap.xml"
end
private :default_sitemap_location
302
+
303
# Detect CMS and get detailed information.
#
# Detection is a first-match chain over the serialized HTML and the
# 'generator' meta value, in this order: WordPress, Drupal, Joomla, Shopify,
# Wix, Squarespace. The HTML is serialized once and reused for every check.
# Version is only filled in when the generator string carries one; themes
# and plugins are only collected for WordPress. Memoized in @cms_info.
#
# @return [Hash] { name:, version:, themes: [], plugins: [] }
def cms_info
  @cms_info ||= begin
    html = @page.to_html
    generator = @meta['generator']
    info = { name: nil, version: nil, themes: [], plugins: [] }

    if html.include?('wp-content') || generator&.include?('WordPress')
      info[:name] = 'WordPress'
      # Try to extract version from generator meta tag
      info[:version] = Regexp.last_match(1) if generator =~ /WordPress\s+([\d.]+)/

      # Theme directory names referenced by stylesheet links
      @page.css('link[href*="wp-content/themes"]').each do |link|
        theme = link[:href].to_s[%r{themes/([^/]+)}, 1]
        info[:themes] << theme if theme
      end

      # Plugin directory names referenced by stylesheets or scripts
      @page.css('link[href*="wp-content/plugins"], script[src*="wp-content/plugins"]').each do |elem|
        plugin = (elem[:href] || elem[:src]).to_s[%r{plugins/([^/]+)}, 1]
        info[:plugins] << plugin if plugin
      end
    elsif html.include?('Drupal') || generator&.include?('Drupal')
      info[:name] = 'Drupal'
      info[:version] = Regexp.last_match(1) if generator =~ /Drupal\s+([\d.]+)/
    elsif generator&.include?('Joomla')
      info[:name] = 'Joomla'
      info[:version] = Regexp.last_match(1) if generator =~ /Joomla!\s+([\d.]+)/
    elsif html.include?('cdn.shopify.com') || html.include?('Shopify')
      info[:name] = 'Shopify'
    elsif html.include?('wix.com') || html.include?('_wix')
      info[:name] = 'Wix'
    elsif html.include?('squarespace')
      info[:name] = 'Squarespace'
    end

    info[:themes].uniq!
    info[:plugins].uniq!
    info
  end
end
349
+
350
# Calculate a basic accessibility score.
#
# Starts at 100 and subtracts penalties for:
# - images without an alt attribute (up to 25 points, proportional),
# - zero H1 headings (15) or more than one (10),
# - buttons with neither text nor aria-label/aria-labelledby (5 each, max 20),
# - a missing or empty lang attribute on <html> (10),
# - text/email/password inputs and textareas with no <label for=...>
#   pointing at their id (5 each, max 15).
#
# Label lookup collects all `for` values in one query rather than
# interpolating each input id into a CSS selector, so ids containing quotes
# or other selector syntax are handled safely. Memoized in
# @accessibility_score.
#
# @return [Hash] { score: Integer (0-100), details: Array<String> }
def accessibility_score
  @accessibility_score ||= begin
    score = 100
    details = []

    # Check images for alt text
    images_without_alt = @page.css('img:not([alt])').count
    total_images = @page.css('img').count

    if total_images.positive?
      alt_percentage = ((total_images - images_without_alt).to_f / total_images * 100).round
      if alt_percentage < 100
        score -= (100 - alt_percentage) / 4 # Integer division; max 25 points penalty
        details << "#{images_without_alt} images missing alt text"
      end
    end

    # Check heading hierarchy
    h1_count = @page.css('h1').count
    if h1_count.zero?
      score -= 15
      details << 'No H1 heading found'
    elsif h1_count > 1
      score -= 10
      details << 'Multiple H1 headings found'
    end

    # Check for ARIA labels on interactive elements
    unlabeled_buttons = @page.css('button:not([aria-label]):not([aria-labelledby])')
    buttons_without_aria = unlabeled_buttons.count { |btn| btn.text.strip.empty? }

    if buttons_without_aria.positive?
      score -= [buttons_without_aria * 5, 20].min
      details << "#{buttons_without_aria} buttons without accessible labels"
    end

    # Check for language attribute
    html_tag = @page.at('html')
    if html_tag.nil? || html_tag['lang'].nil? || html_tag['lang'].empty?
      score -= 10
      details << 'No language attribute on HTML element'
    end

    # Check for form labels: an input counts as labelled only when some
    # <label for="..."> references its id.
    labelled_ids = @page.css('label[for]').each_with_object({}) do |label, seen|
      seen[label['for']] = true
    end
    inputs = @page.css('input[type="text"], input[type="email"], input[type="password"], textarea')
    inputs_without_labels = inputs.count do |input|
      id = input['id']
      !id || !labelled_ids.key?(id)
    end

    if inputs_without_labels.positive?
      score -= [inputs_without_labels * 5, 15].min
      details << "#{inputs_without_labels} form inputs without labels"
    end

    { score: [score, 0].max, details: details }
  end
end
412
+
413
# Check if the page is mobile-friendly.
#
# A page qualifies when its viewport meta value contains
# 'width=device-width' AND it either links at least one stylesheet or its
# HTML contains the text '@media'.
#
# The result is cached with a `defined?` guard rather than `||=`, so a
# `false` result is remembered too instead of being recomputed on each call.
#
# @return [Boolean] true if mobile-friendly
def mobile_friendly?
  return @mobile_friendly if defined?(@mobile_friendly)

  # Check for viewport meta tag
  viewport = @meta['viewport']
  has_viewport = !viewport.nil? && viewport.include?('width=device-width')

  # Check for responsive CSS (any stylesheet, or an inline media query)
  has_media_queries = stylesheets.any? || @page.to_html.include?('@media')

  @mobile_friendly = has_viewport && has_media_queries
end
146
427
 
@@ -152,7 +433,7 @@ module WebInspector
152
433
  # @return [Array<Hash>] Count results
153
434
  def counter(text, words)
154
435
  words.map do |word|
155
- { word => text.scan(/#{word.downcase}/).size }
436
+ { word => text.scan(/#{Regexp.escape(word.downcase)}/).size }
156
437
  end
157
438
  end
158
439
 
@@ -179,6 +460,30 @@ module WebInspector
179
460
  end
180
461
  end
181
462
 
463
# Filter a list of URLs by a given domain.
#
# A URL is kept when its host equals the domain or is a subdomain of it
# (compared case-insensitively, ignoring a leading "www." on both sides).
# Matching is done on whole domain labels, so "example.com" does not match
# "notexample.com" or "example.com.evil.org". URLs without a host, or that
# cannot be parsed, are dropped.
#
# @param collection [Array<String>] The list of URLs to filter.
# @param user_domain [String, nil] The domain to filter by; falls back to
#   @host when nil or empty.
# @return [Array<String>] The filtered list of URLs (empty when no domain
#   can be determined).
def filter_by_domain(collection, user_domain)
  return [] if collection.empty?

  # Handle nil user_domain
  user_domain = @host.to_s if user_domain.nil? || user_domain.empty?

  # Normalize domain for comparison
  normalized_domain = user_domain.to_s.downcase.gsub(/\s+/, '').sub(/\Awww\./, '')
  return [] if normalized_domain.empty?

  subdomain_suffix = ".#{normalized_domain}"

  collection.select do |item|
    begin
      host = URI.parse(item.to_s).host
    rescue URI::InvalidURIError
      next false
    end
    next false unless host

    uri_host = host.downcase.sub(/\Awww\./, '')
    uri_host == normalized_domain || uri_host.end_with?(subdomain_suffix)
  end
end
486
+
182
487
  # Make a URL absolute
183
488
  # @param url [String] URL to make absolute
184
489
  # @return [String, nil] Absolute URL or nil if invalid
@@ -191,39 +496,31 @@ module WebInspector
191
496
  # Get base URL from the page if not already set
192
497
  if @base_url.nil?
193
498
  base_tag = @page.at_css('base[href]')
194
- @base_url = base_tag ? base_tag['href'] : nil
499
+ @base_url = base_tag ? base_tag['href'] : ''
195
500
  end
196
501
 
197
502
  begin
198
503
  # Try joining with base URL first if available
199
- if @base_url && !@base_url.empty?
200
- begin
201
- return URI.join(@base_url, url).to_s
202
- rescue URI::InvalidURIError, URI::BadURIError
203
- # Fall through to next method
204
- end
205
- end
504
+ return URI.join(@base_url, url).to_s unless @base_url.empty?
505
+ rescue URI::InvalidURIError, URI::BadURIError
506
+ # Fall through to next method
507
+ end
206
508
 
509
+ begin
207
510
  # If we have @url, try to use it
208
- if @url
209
- begin
210
- return URI.join(@url, url).to_s
211
- rescue URI::InvalidURIError, URI::BadURIError
212
- # Fall through to next method
213
- end
214
- end
215
-
216
- # Otherwise use a default http:// base if url is absolute path
217
- return "http://#{@host}#{url}" if url.start_with?('/')
218
-
219
- # For truly relative URLs with no base, we need to make our best guess
220
- return "http://#{@host}/#{url}" if @host
221
-
222
- # Last resort, return the original
223
- url
511
+ return URI.join(@url, url).to_s if @url
224
512
  rescue URI::InvalidURIError, URI::BadURIError
225
- url # Return original instead of nil to be more lenient
513
+ # Fall through to next method
226
514
  end
515
+
516
+ # For relative URLs, we need to make our best guess
517
+ return "http://#{@host}#{url}" if url.start_with?('/')
518
+ return "http://#{@host}/#{url}" if @host
519
+
520
+ # Last resort, return the original
521
+ url
522
+ rescue URI::InvalidURIError, URI::BadURIError
523
+ url # Return original instead of nil to be more lenient
227
524
  end
228
525
 
229
526
  # Extract a snippet from the first long paragraph
@@ -26,6 +26,19 @@ module WebInspector
26
26
  .merge(meta_tag['property'])
27
27
  .merge(meta_tag['itemprop'] || {})
28
28
  .merge('charset' => meta_tag['charset'])
29
+ .merge('author' => author, 'publisher' => publisher)
30
+ end
31
+
32
# Page author taken from the meta tags.
#
# Prefers the `author` entry of the 'name' group and falls back to
# `article:author` in the 'property' group. A missing group is handled
# explicitly; the rescue remains only as a best-effort guard for failures
# raised while building meta_tag itself.
#
# @return [String, nil]
def author
  tags = meta_tag
  (tags['name'] || {})['author'] || (tags['property'] || {})['article:author']
rescue StandardError
  nil
end
37
+
38
# Page publisher taken from the meta tags.
#
# Prefers `article:publisher` and falls back to `og:site_name`, both from the
# 'property' group. A missing group is handled explicitly; the rescue remains
# only as a best-effort guard for failures raised while building meta_tag.
#
# @return [String, nil]
def publisher
  properties = meta_tag['property'] || {}
  properties['article:publisher'] || properties['og:site_name']
rescue StandardError
  nil
end
30
43
 
31
44
  def charset
@@ -19,8 +19,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
19
19
 
20
20
  module WebInspector
21
21
  class Page
22
- attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links,
23
- :domain_links, :domain_images, :images, :response, :status_code, :favicon
22
+ attr_reader :status_code
24
23
 
25
24
  DEFAULT_TIMEOUT = 30
26
25
  DEFAULT_RETRIES = 3
@@ -70,7 +69,8 @@ module WebInspector
70
69
  end
71
70
 
72
71
  # Delegate methods to inspector
73
- %i[title description body links images meta].each do |method|
72
+ %i[title description body links images meta javascripts stylesheets language structured_data microdata
73
+ tag_count feeds social_links robots_txt_url sitemap_url cms_info accessibility_score mobile_friendly?].each do |method|
74
74
  define_method(method) do
75
75
  return nil unless success?
76
76
 
@@ -132,8 +132,99 @@ module WebInspector
132
132
  @inspector.domain_images(u, host)
133
133
  end
134
134
 
135
- # Get full JSON representation of the page
136
- #
135
# Summarize security-related facts about the fetched page.
#
# Always reports whether the scheme is https and whether the
# strict-transport-security and content-security-policy response headers
# are present. For https responses that expose env.response_headers, the
# :ssl_version and :cipher_suite entries of the response env are copied as
# well (they may be nil). The hash is computed once and cached.
#
# @return [Hash] Security information
def security_info
  return @security_info if defined?(@security_info)

  over_https = scheme == 'https'
  header_present = lambda do |header_name|
    response&.headers && response.headers[header_name] ? true : false
  end

  summary = {
    secure: over_https,
    hsts: header_present.call('strict-transport-security'),
    content_security_policy: header_present.call('content-security-policy')
  }

  # Extract SSL/TLS info if available and using HTTPS
  if over_https && response&.env&.response_headers
    summary[:ssl_version] = response.env[:ssl_version]
    summary[:cipher_suite] = response.env[:cipher_suite]
  end

  @security_info = summary
end
154
+
155
# Content type reported by the response.
#
# Reads the 'content-type' response header; yields nil when there is no
# response or the response has no headers.
#
# @return [String, nil] Content type
def content_type
  return nil if response.nil?

  headers = response.headers
  headers && headers['content-type']
end
160
+
161
# Get the size of the page in bytes.
#
# Uses the 'content-length' response header when it is a plain non-negative
# integer; otherwise (header missing or malformed) falls back to the byte
# size of the response body. Returns nil when neither is available. The
# value is computed once and cached, including a nil result.
#
# @return [Integer, nil] Size in bytes
def size
  return @size if defined?(@size)

  declared = (response&.headers && response.headers['content-length']).to_s.strip

  @size = if declared.match?(/\A\d+\z/)
            declared.to_i
          elsif response&.body
            response.body.bytesize
          end
end
172
+
173
+ # Get the load time of the page in seconds
174
+ # @return [Float, nil] Load time in seconds
175
+ attr_reader :load_time
176
+
177
+ # Get all JSON-LD structured data as a hash
178
+ # @return [Array<Hash>] Structured data
179
+ def json_ld
180
+ structured_data
181
+ end
182
+
183
# Get a hash of all technologies detected on the page.
#
# Detection is heuristic: it looks for well-known substrings in script and
# stylesheet URLs, in the page body, in the generator meta value, and in the
# 'x-powered-by' / 'server' response headers. Only detected technologies
# appear in the result (each mapped to true); :server additionally carries
# the raw Server header value. Express is detected from 'x-powered-by'
# regardless of whether a Server header is present.
#
# @return [Hash{Symbol => Object}] Detected technologies
def technologies
  js_files = javascripts || []
  css_files = stylesheets || []
  page_body = body || ''
  page_meta = meta || {}
  response_headers = response&.headers || {}
  powered_by = response_headers['x-powered-by']
  server = response_headers['server']
  script_named = ->(needle) { js_files.any? { |js| js.include?(needle) } }

  techs = {}

  # Frameworks and Libraries
  techs[:jquery] = true if script_named.call('jquery') || page_body.include?('jQuery')
  techs[:react] = true if page_body.include?('data-reactroot') || script_named.call('react')
  techs[:vue] = true if page_body.include?('data-v-app') || script_named.call('vue')
  techs[:angular] = true if page_body.include?('ng-version') || script_named.call('angular')
  if css_files.any? { |css| css.include?('bootstrap') } || page_body.include?('class="container"')
    techs[:bootstrap] = true
  end
  techs[:rails] = true if powered_by&.include?('Rails') || response_headers.key?('x-rails-env')
  techs[:php] = true if powered_by&.include?('PHP')

  # CMS
  techs[:wordpress] = true if page_meta['generator']&.include?('WordPress') || page_body.include?('/wp-content/')
  techs[:shopify] = true if page_body.include?('Shopify.shop')

  # Analytics
  techs[:google_analytics] = true if script_named.call('google-analytics.com')

  # Server
  if server
    techs[:server] = server
    techs[:nginx] = true if server.include?('nginx')
    techs[:apache] = true if server.include?('Apache')
    techs[:iis] = true if server.include?('IIS')
  end

  # Not tied to the Server header: Express usually only sets x-powered-by
  techs[:express] = true if powered_by&.include?('Express')

  techs
end
226
+
227
+ # Get full JSON representation of the page with all new data
137
228
  # @return [Hash] JSON representation of the page
138
229
  def to_hash
139
230
  {
@@ -146,7 +237,25 @@ module WebInspector
146
237
  'meta' => meta,
147
238
  'links' => links,
148
239
  'images' => images,
240
+ 'javascripts' => javascripts,
241
+ 'stylesheets' => stylesheets,
149
242
  'favicon' => favicon,
243
+ 'language' => language,
244
+ 'structured_data' => structured_data,
245
+ 'microdata' => microdata,
246
+ 'security_info' => security_info,
247
+ 'content_type' => content_type,
248
+ 'size' => size,
249
+ 'load_time' => load_time,
250
+ 'technologies' => technologies,
251
+ 'tag_count' => tag_count,
252
+ 'feeds' => feeds,
253
+ 'social_links' => social_links,
254
+ 'robots_txt_url' => robots_txt_url,
255
+ 'sitemap_url' => sitemap_url,
256
+ 'cms_info' => cms_info,
257
+ 'accessibility_score' => accessibility_score,
258
+ 'mobile_friendly' => mobile_friendly?,
150
259
  'response' => {
151
260
  'status' => status_code,
152
261
  'headers' => response&.headers || {},
@@ -166,6 +275,8 @@ module WebInspector
166
275
  private
167
276
 
168
277
  def fetch
278
+ start_time = Time.now
279
+
169
280
  session = Faraday.new(url: url) do |faraday|
170
281
  # Configure retries based on available middleware
171
282
  faraday.request :retry, { max: @retries } if defined?(Faraday::Retry)
@@ -194,6 +305,7 @@ module WebInspector
194
305
  end
195
306
 
196
307
  @url = response.env.url.to_s
308
+ @load_time = Time.now - start_time
197
309
  response
198
310
  rescue Faraday::TimeoutError, Faraday::ConnectionFailed => e
199
311
  retries += 1
@@ -28,6 +28,22 @@ module WebInspector
28
28
  URI(normalized_uri).port
29
29
  end
30
30
 
31
# Whether the request URL parsed into a URI that has a host.
# Any error raised while obtaining the URI counts as "not valid".
#
# @return [Boolean]
def valid?
  return false if uri.nil?

  !uri.host.nil?
rescue StandardError
  false
end
36
+
37
+ def ssl?
38
+ scheme == 'https'
39
+ end
40
+
41
# Human-readable reason why the URL is unusable.
#
# @return [String, nil] nil when valid? is true, otherwise a fixed message
def error_message
  valid? ? nil : 'Invalid URL: Unable to parse the provided URL'
end
46
+
31
47
  private
32
48
 
33
49
  def suffix_domain
@@ -47,7 +63,11 @@ module WebInspector
47
63
  end
48
64
 
49
65
  def normalized_uri
66
+ return '' if uri.nil?
67
+
50
68
  uri.normalize.to_s
69
+ rescue StandardError
70
+ @url
51
71
  end
52
72
  end
53
73
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module WebInspector
4
- VERSION = '1.0.0'
4
+ VERSION = '1.2.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webinspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Santangelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-03-18 00:00:00.000000000 Z
11
+ date: 2025-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake