webinspector 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: df0bf76a03246a803f338a903611f128ee8b6d09329f33a9745a27eeb4e9793b
4
- data.tar.gz: adda6867a10d3dc5f7a9fd0ec414d7046e140fe016d945eb20098a84a5176642
3
+ metadata.gz: 7ef69ac6245db062331d4f0b269d1f389cb9e730e4ad39e51d75236f30e0b6c9
4
+ data.tar.gz: b37d5b30a01ea6b0e74553edb9944acdeac4d5ce2808e319798b8d0d3a9f5390
5
5
  SHA512:
6
- metadata.gz: 01ce7c5aab007a3c9ef300c61a990a6f00d00604c14f0a3fdec28fdfb620a1e50ad576055b892cfe4d59de66975236674ce1f81f66e2d60c0ad0e0a0c3f4a951
7
- data.tar.gz: ca58cdda149cf3b0cc6dcb29017b8e080b70b4ed881c924acddfba98760246417bb0304bc4eb42242a328d4ade99408181cc3ac10016998cd2b23de8fe34a8bf
6
+ metadata.gz: '098f7c7f19f8bfe8b8f3b4977cac590e9c107cb8864128e4abbb3b47327e34743844fcde2d5bf6e3038e39e7271d565dbd19157b703ea041f31e4c3fbc84aa47'
7
+ data.tar.gz: 5fdeaa74861245796f15bf70ba5fcc00e7e7eb2f0c1192a60395a119b24ea1c72b579a24296cb66c1fd77cc7c4cfff8b2505d321a2ef74dd7b5618b84e69ae5c
data/README.md CHANGED
@@ -4,7 +4,6 @@ Ruby gem to inspect web pages. It scrapes a given URL and returns its title, des
4
4
 
5
5
  <a href="https://codeclimate.com/github/davidesantangelo/webinspector"><img src="https://codeclimate.com/github/davidesantangelo/webinspector/badges/gpa.svg" /></a>
6
6
 
7
-
8
7
  ## Installation
9
8
 
10
9
  Add this line to your application's Gemfile:
@@ -34,7 +33,7 @@ page = WebInspector.new('http://example.com')
34
33
  ```ruby
35
34
  page = WebInspector.new('http://example.com', {
36
35
  timeout: 30, # Request timeout in seconds (default: 30)
37
- retries: 3, # Number of retries (default: 3)
36
+ retries: 3, # Number of retries (default: 3)
38
37
  headers: {'User-Agent': 'Custom UA'} # Custom HTTP headers
39
38
  })
40
39
  ```
@@ -138,16 +137,73 @@ page.technologies # hash of detected technologies: { jquery: true, react: true,
138
137
  page.tag_count # hash with counts of each HTML tag: { "div" => 45, "p" => 12, ... }
139
138
  ```
140
139
 
140
+ #### RSS/Atom Feeds
141
+
142
+ ```ruby
143
+ page.feeds # array of RSS/Atom feed URLs found on the page
144
+ ```
145
+
146
+ #### Social Media Links
147
+
148
+ ```ruby
149
+ page.social_links # hash of social media profiles: { facebook: "url", twitter: "url", ... }
150
+ ```
151
+
152
+ #### Robots.txt and Sitemap
153
+
154
+ ```ruby
155
+ page.robots_txt_url # URL to robots.txt
156
+ page.sitemap_url # array of sitemap URLs
157
+ ```
158
+
159
+ #### CMS Detection
160
+
161
+ ```ruby
162
+ page.cms_info # hash with CMS details: { name: "WordPress", version: "6.0", themes: [...], plugins: [...] }
163
+ ```
164
+
165
+ #### Accessibility Score
166
+
167
+ ```ruby
168
+ page.accessibility_score # hash with score (0-100) and details: { score: 85, details: [...] }
169
+ ```
170
+
171
+ #### Mobile-Friendly Check
172
+
173
+ ```ruby
174
+ page.mobile_friendly? # true if the page has a width=device-width viewport meta tag and either stylesheets or @media rules
175
+ ```
176
+
141
177
  ### Export all data to JSON
142
178
 
143
179
  ```ruby
144
180
  page.to_hash # returns a hash with all page data
145
181
  ```
146
182
 
183
+ ## Changelog
184
+
185
+ ### Version 1.2.0
186
+
187
+ **New Features:**
188
+
189
+ - RSS/Atom feed detection with `feeds` method
190
+ - Social media profile extraction with `social_links` method
191
+ - CMS detection and information with `cms_info` method (WordPress, Drupal, Joomla, Shopify, Wix, Squarespace)
192
+ - Accessibility scoring with `accessibility_score` method
193
+ - Mobile-friendly detection with `mobile_friendly?` method
194
+ - Robots.txt and sitemap URL detection with `robots_txt_url` and `sitemap_url` methods
195
+
196
+ **Improvements:**
197
+
198
+ - Enhanced `Request` module with `valid?` and `ssl?` methods for better URL validation
199
+ - Improved `Meta` module with author and publisher extraction
200
+ - Better error handling across all modules
201
+ - Performance improvements with internal caching
202
+
147
203
  ## Contributors
148
204
 
149
- * Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
150
- * Sam Nissen ([@samnissen](https://github.com/samnissen))
205
+ - Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
206
+ - Sam Nissen ([@samnissen](https://github.com/samnissen))
151
207
 
152
208
  ## License
153
209
 
@@ -228,6 +228,203 @@ module WebInspector
228
228
  tags
229
229
  end
230
230
 
231
+ # Extract RSS/Atom feeds from the page
232
+ # @return [Array<String>] Array of feed URLs
233
+ def feeds
234
+ @feeds ||= begin
235
+ feeds = []
236
+
237
+ # Look for feed link tags
238
+ @page.css('link[type="application/rss+xml"], link[type="application/atom+xml"]').each do |link|
239
+ href = link[:href]
240
+ feeds << make_absolute_url(href) if href
241
+ end
242
+
243
+ # Look for common feed patterns in links
244
+ links.each do |link|
245
+ feeds << link if link =~ %r{/(feed|rss|atom)(/|\.xml|$)}i
246
+ end
247
+
248
+ feeds.uniq.compact
249
+ end
250
+ end
251
+
252
+ # Extract social media profile links
253
+ # @return [Hash] Hash of social platform => URL
254
+ def social_links
255
+ @social_links ||= begin
256
+ socials = {}
257
+ platforms = {
258
+ facebook: /facebook\.com/,
259
+ twitter: /(twitter\.com|x\.com)/,
260
+ linkedin: /linkedin\.com/,
261
+ instagram: /instagram\.com/,
262
+ youtube: /youtube\.com/,
263
+ github: /github\.com/,
264
+ tiktok: /tiktok\.com/
265
+ }
266
+
267
+ # Check links
268
+ links.each do |link|
269
+ platforms.each do |platform, pattern|
270
+ socials[platform] ||= link if link.match?(pattern)
271
+ end
272
+ end
273
+
274
+ socials
275
+ end
276
+ end
277
+
278
+ # Get robots.txt URL
279
+ # @return [String, nil] robots.txt URL, or nil when no URL is set
280
+ def robots_txt_url
281
+ "#{@url.split('/')[0..2].join('/')}/robots.txt" if @url
282
+ end
283
+
284
+ # Get candidate sitemap URLs (sitemap link tags plus the default /sitemap.xml)
285
+ # @return [Array<String>] Array of sitemap URLs
286
+ def sitemap_url
287
+ @sitemap_url ||= begin
288
+ sitemaps = []
289
+
290
+ # Check for sitemap link tag
291
+ @page.css('link[rel="sitemap"]').each do |link|
292
+ href = link[:href]
293
+ sitemaps << make_absolute_url(href) if href
294
+ end
295
+
296
+ # Add default sitemap.xml
297
+ sitemaps << "#{@url.split('/')[0..2].join('/')}/sitemap.xml" if @url
298
+
299
+ sitemaps.uniq.compact
300
+ end
301
+ end
302
+
303
+ # Detect CMS and get detailed information
304
+ # @return [Hash] CMS information
305
+ def cms_info
306
+ @cms_info ||= begin
307
+ info = { name: nil, version: nil, themes: [], plugins: [] }
308
+
309
+ # WordPress detection
310
+ if @page.to_html.include?('wp-content') || @meta['generator']&.include?('WordPress')
311
+ info[:name] = 'WordPress'
312
+ # Try to extract version from generator meta tag
313
+ info[:version] = Regexp.last_match(1) if @meta['generator'] =~ /WordPress\s+([\d.]+)/
314
+
315
+ # Detect themes
316
+ @page.css('link[href*="wp-content/themes"]').each do |link|
317
+ info[:themes] << Regexp.last_match(1) if link[:href] =~ %r{themes/([^/]+)}
318
+ end
319
+
320
+ # Detect plugins
321
+ @page.css('link[href*="wp-content/plugins"], script[src*="wp-content/plugins"]').each do |elem|
322
+ src = elem[:href] || elem[:src]
323
+ info[:plugins] << Regexp.last_match(1) if src =~ %r{plugins/([^/]+)}
324
+ end
325
+ # Drupal detection
326
+ elsif @page.to_html.include?('Drupal') || @meta['generator']&.include?('Drupal')
327
+ info[:name] = 'Drupal'
328
+ info[:version] = Regexp.last_match(1) if @meta['generator'] =~ /Drupal\s+([\d.]+)/
329
+ # Joomla detection
330
+ elsif @meta['generator']&.include?('Joomla')
331
+ info[:name] = 'Joomla'
332
+ info[:version] = Regexp.last_match(1) if @meta['generator'] =~ /Joomla!\s+([\d.]+)/
333
+ # Shopify detection
334
+ elsif @page.to_html.include?('cdn.shopify.com') || @page.to_html.include?('Shopify')
335
+ info[:name] = 'Shopify'
336
+ # Wix detection
337
+ elsif @page.to_html.include?('wix.com') || @page.to_html.include?('_wix')
338
+ info[:name] = 'Wix'
339
+ # Squarespace detection
340
+ elsif @page.to_html.include?('squarespace')
341
+ info[:name] = 'Squarespace'
342
+ end
343
+
344
+ info[:themes].uniq!
345
+ info[:plugins].uniq!
346
+ info
347
+ end
348
+ end
349
+
350
+ # Calculate a basic accessibility score
351
+ # @return [Hash] Accessibility score and details
352
+ def accessibility_score
353
+ @accessibility_score ||= begin
354
+ score = 100
355
+ details = []
356
+
357
+ # Check images for alt text
358
+ images_without_alt = @page.css('img:not([alt])').count
359
+ total_images = @page.css('img').count
360
+
361
+ if total_images.positive?
362
+ alt_percentage = ((total_images - images_without_alt).to_f / total_images * 100).round
363
+ if alt_percentage < 100
364
+ penalty = (100 - alt_percentage) / 4 # Max 25 points penalty
365
+ score -= penalty
366
+ details << "#{images_without_alt} images missing alt text"
367
+ end
368
+ end
369
+
370
+ # Check H1 usage (penalize a missing H1 or more than one H1)
371
+ h1_count = @page.css('h1').count
372
+ if h1_count.zero?
373
+ score -= 15
374
+ details << 'No H1 heading found'
375
+ elsif h1_count > 1
376
+ score -= 10
377
+ details << 'Multiple H1 headings found'
378
+ end
379
+
380
+ # Check for text-less buttons lacking aria-label/aria-labelledby
381
+ buttons_without_aria = @page.css('button:not([aria-label]):not([aria-labelledby])').select do |btn|
382
+ btn.text.strip.empty?
383
+ end.count
384
+
385
+ if buttons_without_aria.positive?
386
+ score -= [buttons_without_aria * 5, 20].min
387
+ details << "#{buttons_without_aria} buttons without accessible labels"
388
+ end
389
+
390
+ # Check for language attribute
391
+ html_tag = @page.at('html')
392
+ if html_tag.nil? || html_tag['lang'].nil? || html_tag['lang'].empty?
393
+ score -= 10
394
+ details << 'No language attribute on HTML element'
395
+ end
396
+
397
+ # Check for form labels
398
+ inputs = @page.css('input[type="text"], input[type="email"], input[type="password"], textarea')
399
+ inputs_without_labels = inputs.select do |input|
400
+ id = input['id']
401
+ !id || @page.css("label[for=\"#{id}\"]").empty?
402
+ end.count
403
+
404
+ if inputs_without_labels.positive?
405
+ score -= [inputs_without_labels * 5, 15].min
406
+ details << "#{inputs_without_labels} form inputs without labels"
407
+ end
408
+
409
+ { score: [score, 0].max, details: details }
410
+ end
411
+ end
412
+
413
+ # Check if the page is mobile-friendly
414
+ # @return [Boolean] true if mobile-friendly
415
+ def mobile_friendly?
416
+ @mobile_friendly ||= begin
417
+ # Check for viewport meta tag
418
+ viewport = @meta['viewport']
419
+ has_viewport = !viewport.nil? && viewport.include?('width=device-width')
420
+
421
+ # Check for responsive CSS (any stylesheet present, or '@media' in the page HTML)
422
+ has_media_queries = stylesheets.any? || @page.to_html.include?('@media')
423
+
424
+ has_viewport && has_media_queries
425
+ end
426
+ end
427
+
231
428
  private
232
429
 
233
430
  # Count occurrences of words in text
@@ -26,6 +26,19 @@ module WebInspector
26
26
  .merge(meta_tag['property'])
27
27
  .merge(meta_tag['itemprop'] || {})
28
28
  .merge('charset' => meta_tag['charset'])
29
+ .merge('author' => author, 'publisher' => publisher)
30
+ end
31
+
32
+ def author
33
+ meta_tag['name']['author'] || meta_tag['property']['article:author']
34
+ rescue StandardError
35
+ nil
36
+ end
37
+
38
+ def publisher
39
+ meta_tag['property']['article:publisher'] || meta_tag['property']['og:site_name']
40
+ rescue StandardError
41
+ nil
29
42
  end
30
43
 
31
44
  def charset
@@ -70,7 +70,7 @@ module WebInspector
70
70
 
71
71
  # Delegate methods to inspector
72
72
  %i[title description body links images meta javascripts stylesheets language structured_data microdata
73
- tag_count].each do |method|
73
+ tag_count feeds social_links robots_txt_url sitemap_url cms_info accessibility_score mobile_friendly?].each do |method|
74
74
  define_method(method) do
75
75
  return nil unless success?
76
76
 
@@ -249,6 +249,13 @@ module WebInspector
249
249
  'load_time' => load_time,
250
250
  'technologies' => technologies,
251
251
  'tag_count' => tag_count,
252
+ 'feeds' => feeds,
253
+ 'social_links' => social_links,
254
+ 'robots_txt_url' => robots_txt_url,
255
+ 'sitemap_url' => sitemap_url,
256
+ 'cms_info' => cms_info,
257
+ 'accessibility_score' => accessibility_score,
258
+ 'mobile_friendly' => mobile_friendly?,
252
259
  'response' => {
253
260
  'status' => status_code,
254
261
  'headers' => response&.headers || {},
@@ -28,6 +28,22 @@ module WebInspector
28
28
  URI(normalized_uri).port
29
29
  end
30
30
 
31
+ def valid?
32
+ !uri.nil? && !uri.host.nil?
33
+ rescue StandardError
34
+ false
35
+ end
36
+
37
+ def ssl?
38
+ scheme == 'https'
39
+ end
40
+
41
+ def error_message
42
+ return nil if valid?
43
+
44
+ 'Invalid URL: Unable to parse the provided URL'
45
+ end
46
+
31
47
  private
32
48
 
33
49
  def suffix_domain
@@ -47,7 +63,11 @@ module WebInspector
47
63
  end
48
64
 
49
65
  def normalized_uri
66
+ return '' if uri.nil?
67
+
50
68
  uri.normalize.to_s
69
+ rescue StandardError
70
+ @url
51
71
  end
52
72
  end
53
73
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module WebInspector
4
- VERSION = '1.1.0'
4
+ VERSION = '1.2.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webinspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Santangelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-07-29 00:00:00.000000000 Z
11
+ date: 2025-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake