webinspector 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +112 -4
- data/lib/web_inspector/inspector.rb +357 -60
- data/lib/web_inspector/meta.rb +13 -0
- data/lib/web_inspector/page.rb +117 -5
- data/lib/web_inspector/request.rb +20 -0
- data/lib/web_inspector/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7ef69ac6245db062331d4f0b269d1f389cb9e730e4ad39e51d75236f30e0b6c9
|
|
4
|
+
data.tar.gz: b37d5b30a01ea6b0e74553edb9944acdeac4d5ce2808e319798b8d0d3a9f5390
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '098f7c7f19f8bfe8b8f3b4977cac590e9c107cb8864128e4abbb3b47327e34743844fcde2d5bf6e3038e39e7271d565dbd19157b703ea041f31e4c3fbc84aa47'
|
|
7
|
+
data.tar.gz: 5fdeaa74861245796f15bf70ba5fcc00e7e7eb2f0c1192a60395a119b24ea1c72b579a24296cb66c1fd77cc7c4cfff8b2505d321a2ef74dd7b5618b84e69ae5c
|
data/README.md
CHANGED
|
@@ -4,7 +4,6 @@ Ruby gem to inspect web pages. It scrapes a given URL and returns its title, des
|
|
|
4
4
|
|
|
5
5
|
<a href="https://codeclimate.com/github/davidesantangelo/webinspector"><img src="https://codeclimate.com/github/davidesantangelo/webinspector/badges/gpa.svg" /></a>
|
|
6
6
|
|
|
7
|
-
|
|
8
7
|
## Installation
|
|
9
8
|
|
|
10
9
|
Add this line to your application's Gemfile:
|
|
@@ -34,7 +33,7 @@ page = WebInspector.new('http://example.com')
|
|
|
34
33
|
```ruby
|
|
35
34
|
page = WebInspector.new('http://example.com', {
|
|
36
35
|
timeout: 30, # Request timeout in seconds (default: 30)
|
|
37
|
-
retries: 3, # Number of retries (default: 3)
|
|
36
|
+
retries: 3, # Number of retries (default: 3)
|
|
38
37
|
headers: {'User-Agent': 'Custom UA'} # Custom HTTP headers
|
|
39
38
|
})
|
|
40
39
|
```
|
|
@@ -86,16 +85,125 @@ page.domain_images('example.com') # returns only images hosted on example.com
|
|
|
86
85
|
page.find(["ruby", "rails"]) # returns [{"ruby"=>3}, {"rails"=>1}]
|
|
87
86
|
```
|
|
88
87
|
|
|
88
|
+
#### JavaScript and Stylesheets
|
|
89
|
+
|
|
90
|
+
```ruby
|
|
91
|
+
page.javascripts # array of all JavaScript files (absolute URLs)
|
|
92
|
+
page.stylesheets # array of all CSS stylesheets (absolute URLs)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
#### Language Detection
|
|
96
|
+
|
|
97
|
+
```ruby
|
|
98
|
+
page.language # detected language code (e.g., "en", "es", "fr")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
#### Structured Data
|
|
102
|
+
|
|
103
|
+
```ruby
|
|
104
|
+
page.structured_data # array of JSON-LD structured data objects
|
|
105
|
+
page.microdata # array of microdata items
|
|
106
|
+
page.json_ld # alias for structured_data
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
#### Security Information
|
|
110
|
+
|
|
111
|
+
```ruby
|
|
112
|
+
page.security_info # hash with security details: { secure: true, hsts: true, ... }
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
#### Performance Metrics
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
page.load_time # page load time in seconds
|
|
119
|
+
page.size # page size in bytes
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
#### Content Type
|
|
123
|
+
|
|
124
|
+
```ruby
|
|
125
|
+
page.content_type # content type header (e.g., "text/html; charset=utf-8")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
#### Technology Detection
|
|
129
|
+
|
|
130
|
+
```ruby
|
|
131
|
+
page.technologies # hash of detected technologies: { jquery: true, react: true, ... }
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
#### HTML Tag Statistics
|
|
135
|
+
|
|
136
|
+
```ruby
|
|
137
|
+
page.tag_count # hash with counts of each HTML tag: { "div" => 45, "p" => 12, ... }
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
#### RSS/Atom Feeds
|
|
141
|
+
|
|
142
|
+
```ruby
|
|
143
|
+
page.feeds # array of RSS/Atom feed URLs found on the page
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
#### Social Media Links
|
|
147
|
+
|
|
148
|
+
```ruby
|
|
149
|
+
page.social_links # hash of social media profiles: { facebook: "url", twitter: "url", ... }
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
#### Robots.txt and Sitemap
|
|
153
|
+
|
|
154
|
+
```ruby
|
|
155
|
+
page.robots_txt_url # URL to robots.txt
|
|
156
|
+
page.sitemap_url # array of sitemap URLs
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
#### CMS Detection
|
|
160
|
+
|
|
161
|
+
```ruby
|
|
162
|
+
page.cms_info # hash with CMS details: { name: "WordPress", version: "6.0", themes: [...], plugins: [...] }
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
#### Accessibility Score
|
|
166
|
+
|
|
167
|
+
```ruby
|
|
168
|
+
page.accessibility_score # hash with score (0-100) and details: { score: 85, details: [...] }
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
#### Mobile-Friendly Check
|
|
172
|
+
|
|
173
|
+
```ruby
|
|
174
|
+
page.mobile_friendly? # true if the page has viewport meta tag and responsive CSS
|
|
175
|
+
```
|
|
176
|
+
|
|
89
177
|
### Export all data to JSON
|
|
90
178
|
|
|
91
179
|
```ruby
|
|
92
180
|
page.to_hash # returns a hash with all page data
|
|
93
181
|
```
|
|
94
182
|
|
|
183
|
+
## Changelog
|
|
184
|
+
|
|
185
|
+
### Version 1.2.0
|
|
186
|
+
|
|
187
|
+
**New Features:**
|
|
188
|
+
|
|
189
|
+
- RSS/Atom feed detection with `feeds` method
|
|
190
|
+
- Social media profile extraction with `social_links` method
|
|
191
|
+
- CMS detection and information with `cms_info` method (WordPress, Drupal, Joomla, Shopify, Wix, Squarespace)
|
|
192
|
+
- Accessibility scoring with `accessibility_score` method
|
|
193
|
+
- Mobile-friendly detection with `mobile_friendly?` method
|
|
194
|
+
- Robots.txt and sitemap URL detection with `robots_txt_url` and `sitemap_url` methods
|
|
195
|
+
|
|
196
|
+
**Improvements:**
|
|
197
|
+
|
|
198
|
+
- Enhanced `Request` module with `valid?` and `ssl?` methods for better URL validation
|
|
199
|
+
- Improved `Meta` module with author and publisher extraction
|
|
200
|
+
- Better error handling across all modules
|
|
201
|
+
- Performance improvements with internal caching
|
|
202
|
+
|
|
95
203
|
## Contributors
|
|
96
204
|
|
|
97
|
-
|
|
98
|
-
|
|
205
|
+
- Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
|
|
206
|
+
- Sam Nissen ([@samnissen](https://github.com/samnissen))
|
|
99
207
|
|
|
100
208
|
## License
|
|
101
209
|
|
data/lib/web_inspector/inspector.rb
CHANGED
|
@@ -71,26 +71,7 @@ module WebInspector
|
|
|
71
71
|
# @return [Array<String>] Filtered links
|
|
72
72
|
def domain_links(user_domain, host = nil)
|
|
73
73
|
@host ||= host
|
|
74
|
-
|
|
75
|
-
return [] if links.empty?
|
|
76
|
-
|
|
77
|
-
# Handle nil user_domain
|
|
78
|
-
user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
|
|
79
|
-
|
|
80
|
-
# Normalize domain for comparison
|
|
81
|
-
user_domain = user_domain.to_s.downcase.gsub(/\s+/, '')
|
|
82
|
-
user_domain = user_domain.sub(/^www\./, '') # Remove www prefix for comparison
|
|
83
|
-
|
|
84
|
-
links.select do |link|
|
|
85
|
-
uri = URI.parse(link.to_s)
|
|
86
|
-
next false unless uri.host # Skip URLs without hosts
|
|
87
|
-
|
|
88
|
-
uri_host = uri.host.to_s.downcase
|
|
89
|
-
uri_host = uri_host.sub(/^www\./, '') # Remove www prefix for comparison
|
|
90
|
-
uri_host.include?(user_domain)
|
|
91
|
-
rescue URI::InvalidURIError, NoMethodError
|
|
92
|
-
false
|
|
93
|
-
end
|
|
74
|
+
filter_by_domain(links, user_domain)
|
|
94
75
|
end
|
|
95
76
|
|
|
96
77
|
# Get all images from the page
|
|
@@ -122,25 +103,325 @@ module WebInspector
|
|
|
122
103
|
# @return [Array<String>] Filtered images
|
|
123
104
|
def domain_images(user_domain, host = nil)
|
|
124
105
|
@host ||= host
|
|
106
|
+
filter_by_domain(images, user_domain)
|
|
107
|
+
end
|
|
125
108
|
|
|
126
|
-
|
|
109
|
+
# Get all JavaScript files used by the page
|
|
110
|
+
# @return [Array<String>] Array of JavaScript file URLs
|
|
111
|
+
def javascripts
|
|
112
|
+
@javascripts ||= begin
|
|
113
|
+
scripts = []
|
|
114
|
+
@page.css('script[src]').each do |script|
|
|
115
|
+
src = script[:src]
|
|
116
|
+
next unless src
|
|
127
117
|
|
|
128
|
-
|
|
129
|
-
|
|
118
|
+
# Clean and normalize URL
|
|
119
|
+
src = src.strip
|
|
130
120
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
121
|
+
begin
|
|
122
|
+
absolute_url = make_absolute_url(src)
|
|
123
|
+
scripts << absolute_url if absolute_url
|
|
124
|
+
rescue URI::InvalidURIError, URI::BadURIError
|
|
125
|
+
# Skip invalid URLs
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
scripts.uniq.compact
|
|
129
|
+
end
|
|
130
|
+
end
|
|
134
131
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
132
|
+
# Get stylesheets used by the page
|
|
133
|
+
# @return [Array<String>] Array of CSS file URLs
|
|
134
|
+
def stylesheets
|
|
135
|
+
@stylesheets ||= begin
|
|
136
|
+
styles = []
|
|
137
|
+
@page.css('link[rel="stylesheet"]').each do |style|
|
|
138
|
+
href = style[:href]
|
|
139
|
+
next unless href
|
|
138
140
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
141
|
+
# Clean and normalize URL
|
|
142
|
+
href = href.strip
|
|
143
|
+
|
|
144
|
+
begin
|
|
145
|
+
absolute_url = make_absolute_url(href)
|
|
146
|
+
styles << absolute_url if absolute_url
|
|
147
|
+
rescue URI::InvalidURIError, URI::BadURIError
|
|
148
|
+
# Skip invalid URLs
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
styles.uniq.compact
|
|
152
|
+
end
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
# Detect the page language
|
|
156
|
+
# @return [String, nil] Language code if detected, nil otherwise
|
|
157
|
+
def language
|
|
158
|
+
# Check for html lang attribute first
|
|
159
|
+
html_tag = @page.at('html')
|
|
160
|
+
return html_tag['lang'] if html_tag && html_tag['lang'] && !html_tag['lang'].empty?
|
|
161
|
+
|
|
162
|
+
# Then check for language meta tag
|
|
163
|
+
lang_meta = @meta['content-language']
|
|
164
|
+
return lang_meta if lang_meta && !lang_meta.empty?
|
|
165
|
+
|
|
166
|
+
# Fallback to inspecting content headers if available
|
|
167
|
+
nil
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
# Extract structured data (JSON-LD) from the page
|
|
171
|
+
# @return [Array<Hash>] Array of structured data objects
|
|
172
|
+
def structured_data
|
|
173
|
+
@structured_data ||= begin
|
|
174
|
+
data = []
|
|
175
|
+
@page.css('script[type="application/ld+json"]').each do |script|
|
|
176
|
+
parsed = JSON.parse(script.text)
|
|
177
|
+
data << parsed if parsed
|
|
178
|
+
rescue JSON::ParserError
|
|
179
|
+
# Skip invalid JSON
|
|
180
|
+
end
|
|
181
|
+
data
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
# Extract microdata from the page
|
|
186
|
+
# @return [Array<Hash>] Array of microdata items
|
|
187
|
+
def microdata
|
|
188
|
+
@microdata ||= begin
|
|
189
|
+
items = []
|
|
190
|
+
@page.css('[itemscope]').each do |scope|
|
|
191
|
+
item = { type: scope['itemtype'] }
|
|
192
|
+
properties = {}
|
|
193
|
+
|
|
194
|
+
scope.css('[itemprop]').each do |prop|
|
|
195
|
+
name = prop['itemprop']
|
|
196
|
+
# Extract value based on tag
|
|
197
|
+
value = case prop.name.downcase
|
|
198
|
+
when 'meta'
|
|
199
|
+
prop['content']
|
|
200
|
+
when 'img', 'audio', 'embed', 'iframe', 'source', 'track', 'video'
|
|
201
|
+
make_absolute_url(prop['src'])
|
|
202
|
+
when 'a', 'area', 'link'
|
|
203
|
+
make_absolute_url(prop['href'])
|
|
204
|
+
when 'time'
|
|
205
|
+
prop['datetime'] || prop.text.strip
|
|
206
|
+
else
|
|
207
|
+
prop.text.strip
|
|
208
|
+
end
|
|
209
|
+
properties[name] = value
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
item[:properties] = properties
|
|
213
|
+
items << item
|
|
214
|
+
end
|
|
215
|
+
items
|
|
216
|
+
end
|
|
217
|
+
end
|
|
218
|
+
|
|
219
|
+
# Count all tag types on the page
|
|
220
|
+
# @return [Hash] Counts of different HTML elements
|
|
221
|
+
def tag_count
|
|
222
|
+
tags = {}
|
|
223
|
+
@page.css('*').each do |element|
|
|
224
|
+
tag_name = element.name.downcase
|
|
225
|
+
tags[tag_name] ||= 0
|
|
226
|
+
tags[tag_name] += 1
|
|
227
|
+
end
|
|
228
|
+
tags
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
# Extract RSS/Atom feeds from the page
|
|
232
|
+
# @return [Array<String>] Array of feed URLs
|
|
233
|
+
def feeds
|
|
234
|
+
@feeds ||= begin
|
|
235
|
+
feeds = []
|
|
236
|
+
|
|
237
|
+
# Look for feed link tags
|
|
238
|
+
@page.css('link[type="application/rss+xml"], link[type="application/atom+xml"]').each do |link|
|
|
239
|
+
href = link[:href]
|
|
240
|
+
feeds << make_absolute_url(href) if href
|
|
241
|
+
end
|
|
242
|
+
|
|
243
|
+
# Look for common feed patterns in links
|
|
244
|
+
links.each do |link|
|
|
245
|
+
feeds << link if link =~ %r{/(feed|rss|atom)(/|\.xml|$)}i
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
feeds.uniq.compact
|
|
249
|
+
end
|
|
250
|
+
end
|
|
251
|
+
|
|
252
|
+
# Extract social media profile links
|
|
253
|
+
# @return [Hash] Hash of social platform => URL
|
|
254
|
+
def social_links
|
|
255
|
+
@social_links ||= begin
|
|
256
|
+
socials = {}
|
|
257
|
+
platforms = {
|
|
258
|
+
facebook: /facebook\.com/,
|
|
259
|
+
twitter: /(twitter\.com|x\.com)/,
|
|
260
|
+
linkedin: /linkedin\.com/,
|
|
261
|
+
instagram: /instagram\.com/,
|
|
262
|
+
youtube: /youtube\.com/,
|
|
263
|
+
github: /github\.com/,
|
|
264
|
+
tiktok: /tiktok\.com/
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
# Check links
|
|
268
|
+
links.each do |link|
|
|
269
|
+
platforms.each do |platform, pattern|
|
|
270
|
+
socials[platform] ||= link if link.match?(pattern)
|
|
271
|
+
end
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
socials
|
|
275
|
+
end
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
# Get robots.txt URL
|
|
279
|
+
# @return [String] robots.txt URL
|
|
280
|
+
def robots_txt_url
|
|
281
|
+
"#{@url.split('/')[0..2].join('/')}/robots.txt" if @url
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
# Get sitemap URL
|
|
285
|
+
# @return [Array<String>] Array of sitemap URLs
|
|
286
|
+
def sitemap_url
|
|
287
|
+
@sitemap_url ||= begin
|
|
288
|
+
sitemaps = []
|
|
289
|
+
|
|
290
|
+
# Check for sitemap link tag
|
|
291
|
+
@page.css('link[rel="sitemap"]').each do |link|
|
|
292
|
+
href = link[:href]
|
|
293
|
+
sitemaps << make_absolute_url(href) if href
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
# Add default sitemap.xml
|
|
297
|
+
sitemaps << "#{@url.split('/')[0..2].join('/')}/sitemap.xml" if @url
|
|
298
|
+
|
|
299
|
+
sitemaps.uniq.compact
|
|
300
|
+
end
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
# Detect CMS and get detailed information
|
|
304
|
+
# @return [Hash] CMS information
|
|
305
|
+
def cms_info
|
|
306
|
+
@cms_info ||= begin
|
|
307
|
+
info = { name: nil, version: nil, themes: [], plugins: [] }
|
|
308
|
+
|
|
309
|
+
# WordPress detection
|
|
310
|
+
if @page.to_html.include?('wp-content') || @meta['generator']&.include?('WordPress')
|
|
311
|
+
info[:name] = 'WordPress'
|
|
312
|
+
# Try to extract version from generator meta tag
|
|
313
|
+
info[:version] = Regexp.last_match(1) if @meta['generator'] =~ /WordPress\s+([\d.]+)/
|
|
314
|
+
|
|
315
|
+
# Detect themes
|
|
316
|
+
@page.css('link[href*="wp-content/themes"]').each do |link|
|
|
317
|
+
info[:themes] << Regexp.last_match(1) if link[:href] =~ %r{themes/([^/]+)}
|
|
318
|
+
end
|
|
319
|
+
|
|
320
|
+
# Detect plugins
|
|
321
|
+
@page.css('link[href*="wp-content/plugins"], script[src*="wp-content/plugins"]').each do |elem|
|
|
322
|
+
src = elem[:href] || elem[:src]
|
|
323
|
+
info[:plugins] << Regexp.last_match(1) if src =~ %r{plugins/([^/]+)}
|
|
324
|
+
end
|
|
325
|
+
# Drupal detection
|
|
326
|
+
elsif @page.to_html.include?('Drupal') || @meta['generator']&.include?('Drupal')
|
|
327
|
+
info[:name] = 'Drupal'
|
|
328
|
+
info[:version] = Regexp.last_match(1) if @meta['generator'] =~ /Drupal\s+([\d.]+)/
|
|
329
|
+
# Joomla detection
|
|
330
|
+
elsif @meta['generator']&.include?('Joomla')
|
|
331
|
+
info[:name] = 'Joomla'
|
|
332
|
+
info[:version] = Regexp.last_match(1) if @meta['generator'] =~ /Joomla!\s+([\d.]+)/
|
|
333
|
+
# Shopify detection
|
|
334
|
+
elsif @page.to_html.include?('cdn.shopify.com') || @page.to_html.include?('Shopify')
|
|
335
|
+
info[:name] = 'Shopify'
|
|
336
|
+
# Wix detection
|
|
337
|
+
elsif @page.to_html.include?('wix.com') || @page.to_html.include?('_wix')
|
|
338
|
+
info[:name] = 'Wix'
|
|
339
|
+
# Squarespace detection
|
|
340
|
+
elsif @page.to_html.include?('squarespace')
|
|
341
|
+
info[:name] = 'Squarespace'
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
info[:themes].uniq!
|
|
345
|
+
info[:plugins].uniq!
|
|
346
|
+
info
|
|
347
|
+
end
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
# Calculate a basic accessibility score
|
|
351
|
+
# @return [Hash] Accessibility score and details
|
|
352
|
+
def accessibility_score
|
|
353
|
+
@accessibility_score ||= begin
|
|
354
|
+
score = 100
|
|
355
|
+
details = []
|
|
356
|
+
|
|
357
|
+
# Check images for alt text
|
|
358
|
+
images_without_alt = @page.css('img:not([alt])').count
|
|
359
|
+
total_images = @page.css('img').count
|
|
360
|
+
|
|
361
|
+
if total_images.positive?
|
|
362
|
+
alt_percentage = ((total_images - images_without_alt).to_f / total_images * 100).round
|
|
363
|
+
if alt_percentage < 100
|
|
364
|
+
penalty = (100 - alt_percentage) / 4 # Max 25 points penalty
|
|
365
|
+
score -= penalty
|
|
366
|
+
details << "#{images_without_alt} images missing alt text"
|
|
367
|
+
end
|
|
368
|
+
end
|
|
369
|
+
|
|
370
|
+
# Check heading hierarchy
|
|
371
|
+
h1_count = @page.css('h1').count
|
|
372
|
+
if h1_count.zero?
|
|
373
|
+
score -= 15
|
|
374
|
+
details << 'No H1 heading found'
|
|
375
|
+
elsif h1_count > 1
|
|
376
|
+
score -= 10
|
|
377
|
+
details << 'Multiple H1 headings found'
|
|
378
|
+
end
|
|
379
|
+
|
|
380
|
+
# Check for ARIA labels on interactive elements
|
|
381
|
+
buttons_without_aria = @page.css('button:not([aria-label]):not([aria-labelledby])').select do |btn|
|
|
382
|
+
btn.text.strip.empty?
|
|
383
|
+
end.count
|
|
384
|
+
|
|
385
|
+
if buttons_without_aria.positive?
|
|
386
|
+
score -= [buttons_without_aria * 5, 20].min
|
|
387
|
+
details << "#{buttons_without_aria} buttons without accessible labels"
|
|
388
|
+
end
|
|
389
|
+
|
|
390
|
+
# Check for language attribute
|
|
391
|
+
html_tag = @page.at('html')
|
|
392
|
+
if html_tag.nil? || html_tag['lang'].nil? || html_tag['lang'].empty?
|
|
393
|
+
score -= 10
|
|
394
|
+
details << 'No language attribute on HTML element'
|
|
395
|
+
end
|
|
396
|
+
|
|
397
|
+
# Check for form labels
|
|
398
|
+
inputs = @page.css('input[type="text"], input[type="email"], input[type="password"], textarea')
|
|
399
|
+
inputs_without_labels = inputs.select do |input|
|
|
400
|
+
id = input['id']
|
|
401
|
+
!id || @page.css("label[for=\"#{id}\"]").empty?
|
|
402
|
+
end.count
|
|
403
|
+
|
|
404
|
+
if inputs_without_labels.positive?
|
|
405
|
+
score -= [inputs_without_labels * 5, 15].min
|
|
406
|
+
details << "#{inputs_without_labels} form inputs without labels"
|
|
407
|
+
end
|
|
408
|
+
|
|
409
|
+
{ score: [score, 0].max, details: details }
|
|
410
|
+
end
|
|
411
|
+
end
|
|
412
|
+
|
|
413
|
+
# Check if the page is mobile-friendly
|
|
414
|
+
# @return [Boolean] true if mobile-friendly
|
|
415
|
+
def mobile_friendly?
|
|
416
|
+
@mobile_friendly ||= begin
|
|
417
|
+
# Check for viewport meta tag
|
|
418
|
+
viewport = @meta['viewport']
|
|
419
|
+
has_viewport = !viewport.nil? && viewport.include?('width=device-width')
|
|
420
|
+
|
|
421
|
+
# Check for responsive CSS (media queries)
|
|
422
|
+
has_media_queries = stylesheets.any? || @page.to_html.include?('@media')
|
|
423
|
+
|
|
424
|
+
has_viewport && has_media_queries
|
|
144
425
|
end
|
|
145
426
|
end
|
|
146
427
|
|
|
@@ -152,7 +433,7 @@ module WebInspector
|
|
|
152
433
|
# @return [Array<Hash>] Count results
|
|
153
434
|
def counter(text, words)
|
|
154
435
|
words.map do |word|
|
|
155
|
-
{ word => text.scan(/#{word.downcase}/).size }
|
|
436
|
+
{ word => text.scan(/#{Regexp.escape(word.downcase)}/).size }
|
|
156
437
|
end
|
|
157
438
|
end
|
|
158
439
|
|
|
@@ -179,6 +460,30 @@ module WebInspector
|
|
|
179
460
|
end
|
|
180
461
|
end
|
|
181
462
|
|
|
463
|
+
# Filter a list of URLs by a given domain.
|
|
464
|
+
# @param collection [Array<String>] The list of URLs to filter.
|
|
465
|
+
# @param user_domain [String] The domain to filter by.
|
|
466
|
+
# @return [Array<String>] The filtered list of URLs.
|
|
467
|
+
def filter_by_domain(collection, user_domain)
|
|
468
|
+
return [] if collection.empty?
|
|
469
|
+
|
|
470
|
+
# Handle nil user_domain
|
|
471
|
+
user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
|
|
472
|
+
|
|
473
|
+
# Normalize domain for comparison
|
|
474
|
+
normalized_domain = user_domain.to_s.downcase.gsub(/\s+/, '').sub(/^www\./, '')
|
|
475
|
+
|
|
476
|
+
collection.select do |item|
|
|
477
|
+
uri = URI.parse(item.to_s)
|
|
478
|
+
next false unless uri.host
|
|
479
|
+
|
|
480
|
+
uri_host = uri.host.to_s.downcase.sub(/^www\./, '')
|
|
481
|
+
uri_host.include?(normalized_domain)
|
|
482
|
+
rescue URI::InvalidURIError, NoMethodError
|
|
483
|
+
false
|
|
484
|
+
end
|
|
485
|
+
end
|
|
486
|
+
|
|
182
487
|
# Make a URL absolute
|
|
183
488
|
# @param url [String] URL to make absolute
|
|
184
489
|
# @return [String, nil] Absolute URL or nil if invalid
|
|
@@ -191,39 +496,31 @@ module WebInspector
|
|
|
191
496
|
# Get base URL from the page if not already set
|
|
192
497
|
if @base_url.nil?
|
|
193
498
|
base_tag = @page.at_css('base[href]')
|
|
194
|
-
@base_url = base_tag ? base_tag['href'] :
|
|
499
|
+
@base_url = base_tag ? base_tag['href'] : ''
|
|
195
500
|
end
|
|
196
501
|
|
|
197
502
|
begin
|
|
198
503
|
# Try joining with base URL first if available
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
# Fall through to next method
|
|
204
|
-
end
|
|
205
|
-
end
|
|
504
|
+
return URI.join(@base_url, url).to_s unless @base_url.empty?
|
|
505
|
+
rescue URI::InvalidURIError, URI::BadURIError
|
|
506
|
+
# Fall through to next method
|
|
507
|
+
end
|
|
206
508
|
|
|
509
|
+
begin
|
|
207
510
|
# If we have @url, try to use it
|
|
208
|
-
if @url
|
|
209
|
-
begin
|
|
210
|
-
return URI.join(@url, url).to_s
|
|
211
|
-
rescue URI::InvalidURIError, URI::BadURIError
|
|
212
|
-
# Fall through to next method
|
|
213
|
-
end
|
|
214
|
-
end
|
|
215
|
-
|
|
216
|
-
# Otherwise use a default http:// base if url is absolute path
|
|
217
|
-
return "http://#{@host}#{url}" if url.start_with?('/')
|
|
218
|
-
|
|
219
|
-
# For truly relative URLs with no base, we need to make our best guess
|
|
220
|
-
return "http://#{@host}/#{url}" if @host
|
|
221
|
-
|
|
222
|
-
# Last resort, return the original
|
|
223
|
-
url
|
|
511
|
+
return URI.join(@url, url).to_s if @url
|
|
224
512
|
rescue URI::InvalidURIError, URI::BadURIError
|
|
225
|
-
|
|
513
|
+
# Fall through to next method
|
|
226
514
|
end
|
|
515
|
+
|
|
516
|
+
# For relative URLs, we need to make our best guess
|
|
517
|
+
return "http://#{@host}#{url}" if url.start_with?('/')
|
|
518
|
+
return "http://#{@host}/#{url}" if @host
|
|
519
|
+
|
|
520
|
+
# Last resort, return the original
|
|
521
|
+
url
|
|
522
|
+
rescue URI::InvalidURIError, URI::BadURIError
|
|
523
|
+
url # Return original instead of nil to be more lenient
|
|
227
524
|
end
|
|
228
525
|
|
|
229
526
|
# Extract a snippet from the first long paragraph
|
data/lib/web_inspector/meta.rb
CHANGED
|
@@ -26,6 +26,19 @@ module WebInspector
|
|
|
26
26
|
.merge(meta_tag['property'])
|
|
27
27
|
.merge(meta_tag['itemprop'] || {})
|
|
28
28
|
.merge('charset' => meta_tag['charset'])
|
|
29
|
+
.merge('author' => author, 'publisher' => publisher)
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def author
|
|
33
|
+
meta_tag['name']['author'] || meta_tag['property']['article:author']
|
|
34
|
+
rescue StandardError
|
|
35
|
+
nil
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def publisher
|
|
39
|
+
meta_tag['property']['article:publisher'] || meta_tag['property']['og:site_name']
|
|
40
|
+
rescue StandardError
|
|
41
|
+
nil
|
|
29
42
|
end
|
|
30
43
|
|
|
31
44
|
def charset
|
data/lib/web_inspector/page.rb
CHANGED
|
@@ -19,8 +19,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
|
|
|
19
19
|
|
|
20
20
|
module WebInspector
|
|
21
21
|
class Page
|
|
22
|
-
attr_reader :
|
|
23
|
-
:domain_links, :domain_images, :images, :response, :status_code, :favicon
|
|
22
|
+
attr_reader :status_code
|
|
24
23
|
|
|
25
24
|
DEFAULT_TIMEOUT = 30
|
|
26
25
|
DEFAULT_RETRIES = 3
|
|
@@ -70,7 +69,8 @@ module WebInspector
|
|
|
70
69
|
end
|
|
71
70
|
|
|
72
71
|
# Delegate methods to inspector
|
|
73
|
-
%i[title description body links images meta
|
|
72
|
+
%i[title description body links images meta javascripts stylesheets language structured_data microdata
|
|
73
|
+
tag_count feeds social_links robots_txt_url sitemap_url cms_info accessibility_score mobile_friendly?].each do |method|
|
|
74
74
|
define_method(method) do
|
|
75
75
|
return nil unless success?
|
|
76
76
|
|
|
@@ -132,8 +132,99 @@ module WebInspector
|
|
|
132
132
|
@inspector.domain_images(u, host)
|
|
133
133
|
end
|
|
134
134
|
|
|
135
|
-
# Get
|
|
136
|
-
#
|
|
135
|
+
# Get information about the page's security
|
|
136
|
+
# @return [Hash] Security information
|
|
137
|
+
def security_info
|
|
138
|
+
return @security_info if defined?(@security_info)
|
|
139
|
+
|
|
140
|
+
@security_info = {
|
|
141
|
+
secure: scheme == 'https',
|
|
142
|
+
hsts: response&.headers && response.headers['strict-transport-security'] ? true : false,
|
|
143
|
+
content_security_policy: response&.headers && response.headers['content-security-policy'] ? true : false
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
# Extract SSL/TLS info if available and using HTTPS
|
|
147
|
+
if scheme == 'https' && response&.env&.response_headers
|
|
148
|
+
@security_info[:ssl_version] = response.env[:ssl_version]
|
|
149
|
+
@security_info[:cipher_suite] = response.env[:cipher_suite]
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
@security_info
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
# Get the content type of the page
|
|
156
|
+
# @return [String, nil] Content type
|
|
157
|
+
def content_type
|
|
158
|
+
response&.headers && response.headers['content-type']
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
# Get the size of the page in bytes
|
|
162
|
+
# @return [Integer, nil] Size in bytes
|
|
163
|
+
def size
|
|
164
|
+
return @size if defined?(@size)
|
|
165
|
+
|
|
166
|
+
@size = if response&.headers && response.headers['content-length']
|
|
167
|
+
response.headers['content-length'].to_i
|
|
168
|
+
elsif response&.body
|
|
169
|
+
response.body.bytesize
|
|
170
|
+
end
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
# Get the load time of the page in seconds
|
|
174
|
+
# @return [Float, nil] Load time in seconds
|
|
175
|
+
attr_reader :load_time
|
|
176
|
+
|
|
177
|
+
# Get all JSON-LD structured data as a hash
|
|
178
|
+
# @return [Array<Hash>] Structured data
|
|
179
|
+
def json_ld
|
|
180
|
+
structured_data
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
# Get a hash of all technologies detected on the page
|
|
184
|
+
# @return [Hash] Detected technologies
|
|
185
|
+
def technologies
|
|
186
|
+
techs = {}
|
|
187
|
+
js_files = javascripts || []
|
|
188
|
+
css_files = stylesheets || []
|
|
189
|
+
page_body = body || ''
|
|
190
|
+
page_meta = meta || {}
|
|
191
|
+
response_headers = response&.headers || {}
|
|
192
|
+
|
|
193
|
+
# Frameworks and Libraries
|
|
194
|
+
techs[:jquery] = true if js_files.any? { |js| js.include?('jquery') } || page_body.include?('jQuery')
|
|
195
|
+
techs[:react] = true if page_body.include?('data-reactroot') || js_files.any? { |js| js.include?('react') }
|
|
196
|
+
techs[:vue] = true if page_body.include?('data-v-app') || js_files.any? { |js| js.include?('vue') }
|
|
197
|
+
techs[:angular] = true if page_body.include?('ng-version') || js_files.any? { |js| js.include?('angular') }
|
|
198
|
+
techs[:bootstrap] = true if css_files.any? do |css|
|
|
199
|
+
css.include?('bootstrap')
|
|
200
|
+
end || page_body.include?('class="container"')
|
|
201
|
+
if response_headers['x-powered-by']&.include?('Rails') || response_headers.key?('x-rails-env')
|
|
202
|
+
techs[:rails] =
|
|
203
|
+
true
|
|
204
|
+
end
|
|
205
|
+
techs[:php] = true if response_headers['x-powered-by']&.include?('PHP')
|
|
206
|
+
|
|
207
|
+
# CMS
|
|
208
|
+
techs[:wordpress] = true if page_meta['generator']&.include?('WordPress') || page_body.include?('/wp-content/')
|
|
209
|
+
techs[:shopify] = true if page_body.include?('Shopify.shop')
|
|
210
|
+
|
|
211
|
+
# Analytics
|
|
212
|
+
techs[:google_analytics] = true if js_files.any? { |js| js.include?('google-analytics.com') }
|
|
213
|
+
|
|
214
|
+
# Server
|
|
215
|
+
server = response_headers['server']
|
|
216
|
+
if server
|
|
217
|
+
techs[:server] = server
|
|
218
|
+
techs[:nginx] = true if server.include?('nginx')
|
|
219
|
+
techs[:apache] = true if server.include?('Apache')
|
|
220
|
+
techs[:iis] = true if server.include?('IIS')
|
|
221
|
+
techs[:express] = true if response_headers['x-powered-by']&.include?('Express')
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
techs
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
# Get full JSON representation of the page with all new data
|
|
137
228
|
# @return [Hash] JSON representation of the page
|
|
138
229
|
def to_hash
|
|
139
230
|
{
|
|
@@ -146,7 +237,25 @@ module WebInspector
|
|
|
146
237
|
'meta' => meta,
|
|
147
238
|
'links' => links,
|
|
148
239
|
'images' => images,
|
|
240
|
+
'javascripts' => javascripts,
|
|
241
|
+
'stylesheets' => stylesheets,
|
|
149
242
|
'favicon' => favicon,
|
|
243
|
+
'language' => language,
|
|
244
|
+
'structured_data' => structured_data,
|
|
245
|
+
'microdata' => microdata,
|
|
246
|
+
'security_info' => security_info,
|
|
247
|
+
'content_type' => content_type,
|
|
248
|
+
'size' => size,
|
|
249
|
+
'load_time' => load_time,
|
|
250
|
+
'technologies' => technologies,
|
|
251
|
+
'tag_count' => tag_count,
|
|
252
|
+
'feeds' => feeds,
|
|
253
|
+
'social_links' => social_links,
|
|
254
|
+
'robots_txt_url' => robots_txt_url,
|
|
255
|
+
'sitemap_url' => sitemap_url,
|
|
256
|
+
'cms_info' => cms_info,
|
|
257
|
+
'accessibility_score' => accessibility_score,
|
|
258
|
+
'mobile_friendly' => mobile_friendly?,
|
|
150
259
|
'response' => {
|
|
151
260
|
'status' => status_code,
|
|
152
261
|
'headers' => response&.headers || {},
|
|
@@ -166,6 +275,8 @@ module WebInspector
|
|
|
166
275
|
private
|
|
167
276
|
|
|
168
277
|
def fetch
|
|
278
|
+
start_time = Time.now
|
|
279
|
+
|
|
169
280
|
session = Faraday.new(url: url) do |faraday|
|
|
170
281
|
# Configure retries based on available middleware
|
|
171
282
|
faraday.request :retry, { max: @retries } if defined?(Faraday::Retry)
|
|
@@ -194,6 +305,7 @@ module WebInspector
|
|
|
194
305
|
end
|
|
195
306
|
|
|
196
307
|
@url = response.env.url.to_s
|
|
308
|
+
@load_time = Time.now - start_time
|
|
197
309
|
response
|
|
198
310
|
rescue Faraday::TimeoutError, Faraday::ConnectionFailed => e
|
|
199
311
|
retries += 1
|
data/lib/web_inspector/request.rb
CHANGED
|
@@ -28,6 +28,22 @@ module WebInspector
|
|
|
28
28
|
URI(normalized_uri).port
|
|
29
29
|
end
|
|
30
30
|
|
|
31
|
+
def valid?
|
|
32
|
+
!uri.nil? && !uri.host.nil?
|
|
33
|
+
rescue StandardError
|
|
34
|
+
false
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def ssl?
|
|
38
|
+
scheme == 'https'
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def error_message
|
|
42
|
+
return nil if valid?
|
|
43
|
+
|
|
44
|
+
'Invalid URL: Unable to parse the provided URL'
|
|
45
|
+
end
|
|
46
|
+
|
|
31
47
|
private
|
|
32
48
|
|
|
33
49
|
def suffix_domain
|
|
@@ -47,7 +63,11 @@ module WebInspector
|
|
|
47
63
|
end
|
|
48
64
|
|
|
49
65
|
def normalized_uri
|
|
66
|
+
return '' if uri.nil?
|
|
67
|
+
|
|
50
68
|
uri.normalize.to_s
|
|
69
|
+
rescue StandardError
|
|
70
|
+
@url
|
|
51
71
|
end
|
|
52
72
|
end
|
|
53
73
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: webinspector
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.0
|
|
4
|
+
version: 1.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Davide Santangelo
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-11-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|