webinspector 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -4
- data/lib/web_inspector/inspector.rb +197 -0
- data/lib/web_inspector/meta.rb +13 -0
- data/lib/web_inspector/page.rb +8 -1
- data/lib/web_inspector/request.rb +20 -0
- data/lib/web_inspector/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7ef69ac6245db062331d4f0b269d1f389cb9e730e4ad39e51d75236f30e0b6c9
|
|
4
|
+
data.tar.gz: b37d5b30a01ea6b0e74553edb9944acdeac4d5ce2808e319798b8d0d3a9f5390
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '098f7c7f19f8bfe8b8f3b4977cac590e9c107cb8864128e4abbb3b47327e34743844fcde2d5bf6e3038e39e7271d565dbd19157b703ea041f31e4c3fbc84aa47'
|
|
7
|
+
data.tar.gz: 5fdeaa74861245796f15bf70ba5fcc00e7e7eb2f0c1192a60395a119b24ea1c72b579a24296cb66c1fd77cc7c4cfff8b2505d321a2ef74dd7b5618b84e69ae5c
|
data/README.md
CHANGED
|
@@ -4,7 +4,6 @@ Ruby gem to inspect web pages. It scrapes a given URL and returns its title, des
|
|
|
4
4
|
|
|
5
5
|
<a href="https://codeclimate.com/github/davidesantangelo/webinspector"><img src="https://codeclimate.com/github/davidesantangelo/webinspector/badges/gpa.svg" /></a>
|
|
6
6
|
|
|
7
|
-
|
|
8
7
|
## Installation
|
|
9
8
|
|
|
10
9
|
Add this line to your application's Gemfile:
|
|
@@ -34,7 +33,7 @@ page = WebInspector.new('http://example.com')
|
|
|
34
33
|
```ruby
|
|
35
34
|
page = WebInspector.new('http://example.com', {
|
|
36
35
|
timeout: 30, # Request timeout in seconds (default: 30)
|
|
37
|
-
retries: 3, # Number of retries (default: 3)
|
|
36
|
+
retries: 3, # Number of retries (default: 3)
|
|
38
37
|
headers: {'User-Agent': 'Custom UA'} # Custom HTTP headers
|
|
39
38
|
})
|
|
40
39
|
```
|
|
@@ -138,16 +137,73 @@ page.technologies # hash of detected technologies: { jquery: true, react: true,
|
|
|
138
137
|
page.tag_count # hash with counts of each HTML tag: { "div" => 45, "p" => 12, ... }
|
|
139
138
|
```
|
|
140
139
|
|
|
140
|
+
#### RSS/Atom Feeds
|
|
141
|
+
|
|
142
|
+
```ruby
|
|
143
|
+
page.feeds # array of RSS/Atom feed URLs found on the page
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
#### Social Media Links
|
|
147
|
+
|
|
148
|
+
```ruby
|
|
149
|
+
page.social_links # hash of social media profiles: { facebook: "url", twitter: "url", ... }
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
#### Robots.txt and Sitemap
|
|
153
|
+
|
|
154
|
+
```ruby
|
|
155
|
+
page.robots_txt_url # URL to robots.txt
|
|
156
|
+
page.sitemap_url # array of sitemap URLs
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
#### CMS Detection
|
|
160
|
+
|
|
161
|
+
```ruby
|
|
162
|
+
page.cms_info # hash with CMS details: { name: "WordPress", version: "6.0", themes: [...], plugins: [...] }
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
#### Accessibility Score
|
|
166
|
+
|
|
167
|
+
```ruby
|
|
168
|
+
page.accessibility_score # hash with score (0-100) and details: { score: 85, details: [...] }
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
#### Mobile-Friendly Check
|
|
172
|
+
|
|
173
|
+
```ruby
|
|
174
|
+
page.mobile_friendly? # true if the page has viewport meta tag and responsive CSS
|
|
175
|
+
```
|
|
176
|
+
|
|
141
177
|
### Export all data to JSON
|
|
142
178
|
|
|
143
179
|
```ruby
|
|
144
180
|
page.to_hash # returns a hash with all page data
|
|
145
181
|
```
|
|
146
182
|
|
|
183
|
+
## Changelog
|
|
184
|
+
|
|
185
|
+
### Version 1.2.0
|
|
186
|
+
|
|
187
|
+
**New Features:**
|
|
188
|
+
|
|
189
|
+
- RSS/Atom feed detection with `feeds` method
|
|
190
|
+
- Social media profile extraction with `social_links` method
|
|
191
|
+
- CMS detection and information with `cms_info` method (WordPress, Drupal, Joomla, Shopify, Wix, Squarespace)
|
|
192
|
+
- Accessibility scoring with `accessibility_score` method
|
|
193
|
+
- Mobile-friendly detection with `mobile_friendly?` method
|
|
194
|
+
- Robots.txt and sitemap URL detection with `robots_txt_url` and `sitemap_url` methods
|
|
195
|
+
|
|
196
|
+
**Improvements:**
|
|
197
|
+
|
|
198
|
+
- Enhanced `Request` module with `valid?` and `ssl?` methods for better URL validation
|
|
199
|
+
- Improved `Meta` module with author and publisher extraction
|
|
200
|
+
- Better error handling across all modules
|
|
201
|
+
- Performance improvements with internal caching
|
|
202
|
+
|
|
147
203
|
## Contributors
|
|
148
204
|
|
|
149
|
-
|
|
150
|
-
|
|
205
|
+
- Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
|
|
206
|
+
- Sam Nissen ([@samnissen](https://github.com/samnissen))
|
|
151
207
|
|
|
152
208
|
## License
|
|
153
209
|
|
data/lib/web_inspector/inspector.rb
CHANGED
|
@@ -228,6 +228,203 @@ module WebInspector
|
|
|
228
228
|
tags
|
|
229
229
|
end
|
|
230
230
|
|
|
231
|
+
# Extract RSS/Atom feeds from the page
|
|
232
|
+
# @return [Array<String>] Array of feed URLs
|
|
233
|
+
def feeds
|
|
234
|
+
@feeds ||= begin
|
|
235
|
+
feeds = []
|
|
236
|
+
|
|
237
|
+
# Look for feed link tags
|
|
238
|
+
@page.css('link[type="application/rss+xml"], link[type="application/atom+xml"]').each do |link|
|
|
239
|
+
href = link[:href]
|
|
240
|
+
feeds << make_absolute_url(href) if href
|
|
241
|
+
end
|
|
242
|
+
|
|
243
|
+
# Look for common feed patterns in links
|
|
244
|
+
links.each do |link|
|
|
245
|
+
feeds << link if link =~ %r{/(feed|rss|atom)(/|\.xml|$)}i
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
feeds.uniq.compact
|
|
249
|
+
end
|
|
250
|
+
end
|
|
251
|
+
|
|
252
|
+
# Extract social media profile links
|
|
253
|
+
# @return [Hash] Hash of social platform => URL
|
|
254
|
+
def social_links
|
|
255
|
+
@social_links ||= begin
|
|
256
|
+
socials = {}
|
|
257
|
+
platforms = {
|
|
258
|
+
facebook: /facebook\.com/,
|
|
259
|
+
twitter: /(twitter\.com|x\.com)/,
|
|
260
|
+
linkedin: /linkedin\.com/,
|
|
261
|
+
instagram: /instagram\.com/,
|
|
262
|
+
youtube: /youtube\.com/,
|
|
263
|
+
github: /github\.com/,
|
|
264
|
+
tiktok: /tiktok\.com/
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
# Check links
|
|
268
|
+
links.each do |link|
|
|
269
|
+
platforms.each do |platform, pattern|
|
|
270
|
+
socials[platform] ||= link if link.match?(pattern)
|
|
271
|
+
end
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
socials
|
|
275
|
+
end
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
# Get robots.txt URL
|
|
279
|
+
# @return [String] robots.txt URL
|
|
280
|
+
def robots_txt_url
|
|
281
|
+
"#{@url.split('/')[0..2].join('/')}/robots.txt" if @url
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
# Get sitemap URL
|
|
285
|
+
# @return [Array<String>] Array of sitemap URLs
|
|
286
|
+
def sitemap_url
|
|
287
|
+
@sitemap_url ||= begin
|
|
288
|
+
sitemaps = []
|
|
289
|
+
|
|
290
|
+
# Check for sitemap link tag
|
|
291
|
+
@page.css('link[rel="sitemap"]').each do |link|
|
|
292
|
+
href = link[:href]
|
|
293
|
+
sitemaps << make_absolute_url(href) if href
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
# Add default sitemap.xml
|
|
297
|
+
sitemaps << "#{@url.split('/')[0..2].join('/')}/sitemap.xml" if @url
|
|
298
|
+
|
|
299
|
+
sitemaps.uniq.compact
|
|
300
|
+
end
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
# Detect CMS and get detailed information
|
|
304
|
+
# @return [Hash] CMS information
|
|
305
|
+
def cms_info
|
|
306
|
+
@cms_info ||= begin
|
|
307
|
+
info = { name: nil, version: nil, themes: [], plugins: [] }
|
|
308
|
+
|
|
309
|
+
# WordPress detection
|
|
310
|
+
if @page.to_html.include?('wp-content') || @meta['generator']&.include?('WordPress')
|
|
311
|
+
info[:name] = 'WordPress'
|
|
312
|
+
# Try to extract version from generator meta tag
|
|
313
|
+
info[:version] = Regexp.last_match(1) if @meta['generator'] =~ /WordPress\s+([\d.]+)/
|
|
314
|
+
|
|
315
|
+
# Detect themes
|
|
316
|
+
@page.css('link[href*="wp-content/themes"]').each do |link|
|
|
317
|
+
info[:themes] << Regexp.last_match(1) if link[:href] =~ %r{themes/([^/]+)}
|
|
318
|
+
end
|
|
319
|
+
|
|
320
|
+
# Detect plugins
|
|
321
|
+
@page.css('link[href*="wp-content/plugins"], script[src*="wp-content/plugins"]').each do |elem|
|
|
322
|
+
src = elem[:href] || elem[:src]
|
|
323
|
+
info[:plugins] << Regexp.last_match(1) if src =~ %r{plugins/([^/]+)}
|
|
324
|
+
end
|
|
325
|
+
# Drupal detection
|
|
326
|
+
elsif @page.to_html.include?('Drupal') || @meta['generator']&.include?('Drupal')
|
|
327
|
+
info[:name] = 'Drupal'
|
|
328
|
+
info[:version] = Regexp.last_match(1) if @meta['generator'] =~ /Drupal\s+([\d.]+)/
|
|
329
|
+
# Joomla detection
|
|
330
|
+
elsif @meta['generator']&.include?('Joomla')
|
|
331
|
+
info[:name] = 'Joomla'
|
|
332
|
+
info[:version] = Regexp.last_match(1) if @meta['generator'] =~ /Joomla!\s+([\d.]+)/
|
|
333
|
+
# Shopify detection
|
|
334
|
+
elsif @page.to_html.include?('cdn.shopify.com') || @page.to_html.include?('Shopify')
|
|
335
|
+
info[:name] = 'Shopify'
|
|
336
|
+
# Wix detection
|
|
337
|
+
elsif @page.to_html.include?('wix.com') || @page.to_html.include?('_wix')
|
|
338
|
+
info[:name] = 'Wix'
|
|
339
|
+
# Squarespace detection
|
|
340
|
+
elsif @page.to_html.include?('squarespace')
|
|
341
|
+
info[:name] = 'Squarespace'
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
info[:themes].uniq!
|
|
345
|
+
info[:plugins].uniq!
|
|
346
|
+
info
|
|
347
|
+
end
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
# Calculate a basic accessibility score
|
|
351
|
+
# @return [Hash] Accessibility score and details
|
|
352
|
+
def accessibility_score
|
|
353
|
+
@accessibility_score ||= begin
|
|
354
|
+
score = 100
|
|
355
|
+
details = []
|
|
356
|
+
|
|
357
|
+
# Check images for alt text
|
|
358
|
+
images_without_alt = @page.css('img:not([alt])').count
|
|
359
|
+
total_images = @page.css('img').count
|
|
360
|
+
|
|
361
|
+
if total_images.positive?
|
|
362
|
+
alt_percentage = ((total_images - images_without_alt).to_f / total_images * 100).round
|
|
363
|
+
if alt_percentage < 100
|
|
364
|
+
penalty = (100 - alt_percentage) / 4 # Max 25 points penalty
|
|
365
|
+
score -= penalty
|
|
366
|
+
details << "#{images_without_alt} images missing alt text"
|
|
367
|
+
end
|
|
368
|
+
end
|
|
369
|
+
|
|
370
|
+
# Check heading hierarchy
|
|
371
|
+
h1_count = @page.css('h1').count
|
|
372
|
+
if h1_count.zero?
|
|
373
|
+
score -= 15
|
|
374
|
+
details << 'No H1 heading found'
|
|
375
|
+
elsif h1_count > 1
|
|
376
|
+
score -= 10
|
|
377
|
+
details << 'Multiple H1 headings found'
|
|
378
|
+
end
|
|
379
|
+
|
|
380
|
+
# Check for ARIA labels on interactive elements
|
|
381
|
+
buttons_without_aria = @page.css('button:not([aria-label]):not([aria-labelledby])').select do |btn|
|
|
382
|
+
btn.text.strip.empty?
|
|
383
|
+
end.count
|
|
384
|
+
|
|
385
|
+
if buttons_without_aria.positive?
|
|
386
|
+
score -= [buttons_without_aria * 5, 20].min
|
|
387
|
+
details << "#{buttons_without_aria} buttons without accessible labels"
|
|
388
|
+
end
|
|
389
|
+
|
|
390
|
+
# Check for language attribute
|
|
391
|
+
html_tag = @page.at('html')
|
|
392
|
+
if html_tag.nil? || html_tag['lang'].nil? || html_tag['lang'].empty?
|
|
393
|
+
score -= 10
|
|
394
|
+
details << 'No language attribute on HTML element'
|
|
395
|
+
end
|
|
396
|
+
|
|
397
|
+
# Check for form labels
|
|
398
|
+
inputs = @page.css('input[type="text"], input[type="email"], input[type="password"], textarea')
|
|
399
|
+
inputs_without_labels = inputs.select do |input|
|
|
400
|
+
id = input['id']
|
|
401
|
+
!id || @page.css("label[for=\"#{id}\"]").empty?
|
|
402
|
+
end.count
|
|
403
|
+
|
|
404
|
+
if inputs_without_labels.positive?
|
|
405
|
+
score -= [inputs_without_labels * 5, 15].min
|
|
406
|
+
details << "#{inputs_without_labels} form inputs without labels"
|
|
407
|
+
end
|
|
408
|
+
|
|
409
|
+
{ score: [score, 0].max, details: details }
|
|
410
|
+
end
|
|
411
|
+
end
|
|
412
|
+
|
|
413
|
+
# Check if the page is mobile-friendly
|
|
414
|
+
# @return [Boolean] true if mobile-friendly
|
|
415
|
+
def mobile_friendly?
|
|
416
|
+
@mobile_friendly ||= begin
|
|
417
|
+
# Check for viewport meta tag
|
|
418
|
+
viewport = @meta['viewport']
|
|
419
|
+
has_viewport = !viewport.nil? && viewport.include?('width=device-width')
|
|
420
|
+
|
|
421
|
+
# Check for responsive CSS (media queries)
|
|
422
|
+
has_media_queries = stylesheets.any? || @page.to_html.include?('@media')
|
|
423
|
+
|
|
424
|
+
has_viewport && has_media_queries
|
|
425
|
+
end
|
|
426
|
+
end
|
|
427
|
+
|
|
231
428
|
private
|
|
232
429
|
|
|
233
430
|
# Count occurrences of words in text
|
data/lib/web_inspector/meta.rb
CHANGED
|
@@ -26,6 +26,19 @@ module WebInspector
|
|
|
26
26
|
.merge(meta_tag['property'])
|
|
27
27
|
.merge(meta_tag['itemprop'] || {})
|
|
28
28
|
.merge('charset' => meta_tag['charset'])
|
|
29
|
+
.merge('author' => author, 'publisher' => publisher)
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def author
|
|
33
|
+
meta_tag['name']['author'] || meta_tag['property']['article:author']
|
|
34
|
+
rescue StandardError
|
|
35
|
+
nil
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def publisher
|
|
39
|
+
meta_tag['property']['article:publisher'] || meta_tag['property']['og:site_name']
|
|
40
|
+
rescue StandardError
|
|
41
|
+
nil
|
|
29
42
|
end
|
|
30
43
|
|
|
31
44
|
def charset
|
data/lib/web_inspector/page.rb
CHANGED
|
@@ -70,7 +70,7 @@ module WebInspector
|
|
|
70
70
|
|
|
71
71
|
# Delegate methods to inspector
|
|
72
72
|
%i[title description body links images meta javascripts stylesheets language structured_data microdata
|
|
73
|
-
tag_count].each do |method|
|
|
73
|
+
tag_count feeds social_links robots_txt_url sitemap_url cms_info accessibility_score mobile_friendly?].each do |method|
|
|
74
74
|
define_method(method) do
|
|
75
75
|
return nil unless success?
|
|
76
76
|
|
|
@@ -249,6 +249,13 @@ module WebInspector
|
|
|
249
249
|
'load_time' => load_time,
|
|
250
250
|
'technologies' => technologies,
|
|
251
251
|
'tag_count' => tag_count,
|
|
252
|
+
'feeds' => feeds,
|
|
253
|
+
'social_links' => social_links,
|
|
254
|
+
'robots_txt_url' => robots_txt_url,
|
|
255
|
+
'sitemap_url' => sitemap_url,
|
|
256
|
+
'cms_info' => cms_info,
|
|
257
|
+
'accessibility_score' => accessibility_score,
|
|
258
|
+
'mobile_friendly' => mobile_friendly?,
|
|
252
259
|
'response' => {
|
|
253
260
|
'status' => status_code,
|
|
254
261
|
'headers' => response&.headers || {},
|
data/lib/web_inspector/request.rb
CHANGED
|
@@ -28,6 +28,22 @@ module WebInspector
|
|
|
28
28
|
URI(normalized_uri).port
|
|
29
29
|
end
|
|
30
30
|
|
|
31
|
+
def valid?
|
|
32
|
+
!uri.nil? && !uri.host.nil?
|
|
33
|
+
rescue StandardError
|
|
34
|
+
false
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def ssl?
|
|
38
|
+
scheme == 'https'
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def error_message
|
|
42
|
+
return nil if valid?
|
|
43
|
+
|
|
44
|
+
'Invalid URL: Unable to parse the provided URL'
|
|
45
|
+
end
|
|
46
|
+
|
|
31
47
|
private
|
|
32
48
|
|
|
33
49
|
def suffix_domain
|
|
@@ -47,7 +63,11 @@ module WebInspector
|
|
|
47
63
|
end
|
|
48
64
|
|
|
49
65
|
def normalized_uri
|
|
66
|
+
return '' if uri.nil?
|
|
67
|
+
|
|
50
68
|
uri.normalize.to_s
|
|
69
|
+
rescue StandardError
|
|
70
|
+
@url
|
|
51
71
|
end
|
|
52
72
|
end
|
|
53
73
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: webinspector
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.0
|
|
4
|
+
version: 1.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Davide Santangelo
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-11-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|