webinspector 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0413d3ff948ab6efff6a1cbe8a7844287149ad06f09353655e6cb208968f9481
4
- data.tar.gz: 152b950595afb57adc522da24c6959f71d160ba903b3d01ce6ee5f6a8b4d81d2
3
+ metadata.gz: df0bf76a03246a803f338a903611f128ee8b6d09329f33a9745a27eeb4e9793b
4
+ data.tar.gz: adda6867a10d3dc5f7a9fd0ec414d7046e140fe016d945eb20098a84a5176642
5
5
  SHA512:
6
- metadata.gz: c6230493b59a0d23585be729ec98706cfdbd6852e2de2d65db83d1638f85110369d41f2275b8e0aa09b58008d53924d036840ce63523d9004f19275999be90f8
7
- data.tar.gz: dad518b0b04c1e341c14c29438ebcf84f4602bf394254a4c61382ad79ce53dae2ac92f138f331d5bf089d2220011f14b0cde0a79608a91177b2bda2ab1773a96
6
+ metadata.gz: 01ce7c5aab007a3c9ef300c61a990a6f00d00604c14f0a3fdec28fdfb620a1e50ad576055b892cfe4d59de66975236674ce1f81f66e2d60c0ad0e0a0c3f4a951
7
+ data.tar.gz: ca58cdda149cf3b0cc6dcb29017b8e080b70b4ed881c924acddfba98760246417bb0304bc4eb42242a328d4ade99408181cc3ac10016998cd2b23de8fe34a8bf
data/README.md CHANGED
@@ -86,6 +86,58 @@ page.domain_images('example.com') # returns only images hosted on example.com
86
86
  page.find(["ruby", "rails"]) # returns [{"ruby"=>3}, {"rails"=>1}]
87
87
  ```
88
88
 
89
+ #### JavaScript and Stylesheets
90
+
91
+ ```ruby
92
+ page.javascripts # array of all JavaScript files (absolute URLs)
93
+ page.stylesheets # array of all CSS stylesheets (absolute URLs)
94
+ ```
95
+
96
+ #### Language Detection
97
+
98
+ ```ruby
99
+ page.language # detected language code (e.g., "en", "es", "fr")
100
+ ```
101
+
102
+ #### Structured Data
103
+
104
+ ```ruby
105
+ page.structured_data # array of JSON-LD structured data objects
106
+ page.microdata # array of microdata items
107
+ page.json_ld # alias for structured_data
108
+ ```
109
+
110
+ #### Security Information
111
+
112
+ ```ruby
113
+ page.security_info # hash with security details: { secure: true, hsts: true, ... }
114
+ ```
115
+
116
+ #### Performance Metrics
117
+
118
+ ```ruby
119
+ page.load_time # time taken to fetch the page, in seconds
120
+ page.size # page size in bytes
121
+ ```
122
+
123
+ #### Content Type
124
+
125
+ ```ruby
126
+ page.content_type # content type header (e.g., "text/html; charset=utf-8")
127
+ ```
128
+
129
+ #### Technology Detection
130
+
131
+ ```ruby
132
+ page.technologies # hash of detected technologies: { jquery: true, react: true, ... }
133
+ ```
134
+
135
+ #### HTML Tag Statistics
136
+
137
+ ```ruby
138
+ page.tag_count # hash with counts of each HTML tag: { "div" => 45, "p" => 12, ... }
139
+ ```
140
+
89
141
  ### Export all data to JSON
90
142
 
91
143
  ```ruby
@@ -71,26 +71,7 @@ module WebInspector
71
71
  # @return [Array<String>] Filtered links
72
72
  def domain_links(user_domain, host = nil)
73
73
  @host ||= host
74
-
75
- return [] if links.empty?
76
-
77
- # Handle nil user_domain
78
- user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
79
-
80
- # Normalize domain for comparison
81
- user_domain = user_domain.to_s.downcase.gsub(/\s+/, '')
82
- user_domain = user_domain.sub(/^www\./, '') # Remove www prefix for comparison
83
-
84
- links.select do |link|
85
- uri = URI.parse(link.to_s)
86
- next false unless uri.host # Skip URLs without hosts
87
-
88
- uri_host = uri.host.to_s.downcase
89
- uri_host = uri_host.sub(/^www\./, '') # Remove www prefix for comparison
90
- uri_host.include?(user_domain)
91
- rescue URI::InvalidURIError, NoMethodError
92
- false
93
- end
74
+ filter_by_domain(links, user_domain)
94
75
  end
95
76
 
96
77
  # Get all images from the page
@@ -122,28 +103,131 @@ module WebInspector
122
103
  # @return [Array<String>] Filtered images
123
104
  def domain_images(user_domain, host = nil)
124
105
  @host ||= host
106
+ filter_by_domain(images, user_domain)
107
+ end
125
108
 
126
- return [] if images.empty?
109
+ # Get all JavaScript files used by the page
110
+ # @return [Array<String>] Array of JavaScript file URLs
111
+ def javascripts
112
+ @javascripts ||= begin
113
+ scripts = []
114
+ @page.css('script[src]').each do |script|
115
+ src = script[:src]
116
+ next unless src
127
117
 
128
- # Handle nil user_domain
129
- user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
118
+ # Clean and normalize URL
119
+ src = src.strip
130
120
 
131
- # Normalize domain for comparison
132
- user_domain = user_domain.to_s.downcase.gsub(/\s+/, '')
133
- user_domain = user_domain.sub(/^www\./, '') # Remove www prefix for comparison
121
+ begin
122
+ absolute_url = make_absolute_url(src)
123
+ scripts << absolute_url if absolute_url
124
+ rescue URI::InvalidURIError, URI::BadURIError
125
+ # Skip invalid URLs
126
+ end
127
+ end
128
+ scripts.uniq.compact
129
+ end
130
+ end
134
131
 
135
- images.select do |img|
136
- uri = URI.parse(img.to_s)
137
- next false unless uri.host # Skip URLs without hosts
132
+ # Get stylesheets used by the page
133
+ # @return [Array<String>] Array of CSS file URLs
134
+ def stylesheets
135
+ @stylesheets ||= begin
136
+ styles = []
137
+ @page.css('link[rel="stylesheet"]').each do |style|
138
+ href = style[:href]
139
+ next unless href
138
140
 
139
- uri_host = uri.host.to_s.downcase
140
- uri_host = uri_host.sub(/^www\./, '') # Remove www prefix for comparison
141
- uri_host.include?(user_domain)
142
- rescue URI::InvalidURIError, NoMethodError
143
- false
141
+ # Clean and normalize URL
142
+ href = href.strip
143
+
144
+ begin
145
+ absolute_url = make_absolute_url(href)
146
+ styles << absolute_url if absolute_url
147
+ rescue URI::InvalidURIError, URI::BadURIError
148
+ # Skip invalid URLs
149
+ end
150
+ end
151
+ styles.uniq.compact
152
+ end
153
+ end
154
+
155
+ # Detect the page language
156
+ # @return [String, nil] Language code if detected, nil otherwise
157
+ def language
158
+ # Check for html lang attribute first
159
+ html_tag = @page.at('html')
160
+ return html_tag['lang'] if html_tag && html_tag['lang'] && !html_tag['lang'].empty?
161
+
162
+ # Then check for language meta tag
163
+ lang_meta = @meta['content-language']
164
+ return lang_meta if lang_meta && !lang_meta.empty?
165
+
166
+ # Neither the html lang attribute nor the content-language meta value is set; response headers are not inspected
167
+ nil
168
+ end
169
+
170
+ # Extract structured data (JSON-LD) from the page
171
+ # @return [Array<Hash>] Array of structured data objects
172
+ def structured_data
173
+ @structured_data ||= begin
174
+ data = []
175
+ @page.css('script[type="application/ld+json"]').each do |script|
176
+ parsed = JSON.parse(script.text)
177
+ data << parsed if parsed
178
+ rescue JSON::ParserError
179
+ # Skip invalid JSON
180
+ end
181
+ data
182
+ end
183
+ end
184
+
185
+ # Extract microdata from the page
186
+ # @return [Array<Hash>] Array of microdata items
187
+ def microdata
188
+ @microdata ||= begin
189
+ items = []
190
+ @page.css('[itemscope]').each do |scope|
191
+ item = { type: scope['itemtype'] }
192
+ properties = {}
193
+
194
+ scope.css('[itemprop]').each do |prop|
195
+ name = prop['itemprop']
196
+ # Extract value based on tag
197
+ value = case prop.name.downcase
198
+ when 'meta'
199
+ prop['content']
200
+ when 'img', 'audio', 'embed', 'iframe', 'source', 'track', 'video'
201
+ make_absolute_url(prop['src'])
202
+ when 'a', 'area', 'link'
203
+ make_absolute_url(prop['href'])
204
+ when 'time'
205
+ prop['datetime'] || prop.text.strip
206
+ else
207
+ prop.text.strip
208
+ end
209
+ properties[name] = value
210
+ end
211
+
212
+ item[:properties] = properties
213
+ items << item
214
+ end
215
+ items
144
216
  end
145
217
  end
146
218
 
219
+ # Count all tag types on the page
220
+ # @return [Hash] Counts of different HTML elements
221
+ def tag_count
222
+ tags = {}
223
+ @page.css('*').each do |element|
224
+ tag_name = element.name.downcase
225
+ tags[tag_name] ||= 0
226
+ tags[tag_name] += 1
227
+ end
228
+ tags
229
+ end
230
+
147
231
  private
148
232
 
149
233
  # Count occurrences of words in text
@@ -152,7 +236,7 @@ module WebInspector
152
236
  # @return [Array<Hash>] Count results
153
237
  def counter(text, words)
154
238
  words.map do |word|
155
- { word => text.scan(/#{word.downcase}/).size }
239
+ { word => text.scan(/#{Regexp.escape(word.downcase)}/).size }
156
240
  end
157
241
  end
158
242
 
@@ -179,6 +263,30 @@ module WebInspector
179
263
  end
180
264
  end
181
265
 
266
+ # Filter a list of URLs by a given domain.
267
+ # @param collection [Array<String>] The list of URLs to filter.
268
+ # @param user_domain [String] The domain to filter by.
269
+ # @return [Array<String>] The filtered list of URLs.
270
+ def filter_by_domain(collection, user_domain)
271
+ return [] if collection.empty?
272
+
273
+ # Fall back to @host when user_domain is nil or empty
274
+ user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
275
+
276
+ # Normalize domain for comparison
277
+ normalized_domain = user_domain.to_s.downcase.gsub(/\s+/, '').sub(/^www\./, '')
278
+
279
+ collection.select do |item|
280
+ uri = URI.parse(item.to_s)
281
+ next false unless uri.host
282
+
283
+ uri_host = uri.host.to_s.downcase.sub(/^www\./, '')
284
+ uri_host.include?(normalized_domain)
285
+ rescue URI::InvalidURIError, NoMethodError
286
+ false
287
+ end
288
+ end
289
+
182
290
  # Make a URL absolute
183
291
  # @param url [String] URL to make absolute
184
292
  # @return [String, nil] Absolute URL or nil if invalid
@@ -191,39 +299,31 @@ module WebInspector
191
299
  # Get base URL from the page if not already set
192
300
  if @base_url.nil?
193
301
  base_tag = @page.at_css('base[href]')
194
- @base_url = base_tag ? base_tag['href'] : nil
302
+ @base_url = base_tag ? base_tag['href'] : ''
195
303
  end
196
304
 
197
305
  begin
198
306
  # Try joining with base URL first if available
199
- if @base_url && !@base_url.empty?
200
- begin
201
- return URI.join(@base_url, url).to_s
202
- rescue URI::InvalidURIError, URI::BadURIError
203
- # Fall through to next method
204
- end
205
- end
307
+ return URI.join(@base_url, url).to_s unless @base_url.empty?
308
+ rescue URI::InvalidURIError, URI::BadURIError
309
+ # Fall through to next method
310
+ end
206
311
 
312
+ begin
207
313
  # If we have @url, try to use it
208
- if @url
209
- begin
210
- return URI.join(@url, url).to_s
211
- rescue URI::InvalidURIError, URI::BadURIError
212
- # Fall through to next method
213
- end
214
- end
215
-
216
- # Otherwise use a default http:// base if url is absolute path
217
- return "http://#{@host}#{url}" if url.start_with?('/')
218
-
219
- # For truly relative URLs with no base, we need to make our best guess
220
- return "http://#{@host}/#{url}" if @host
221
-
222
- # Last resort, return the original
223
- url
314
+ return URI.join(@url, url).to_s if @url
224
315
  rescue URI::InvalidURIError, URI::BadURIError
225
- url # Return original instead of nil to be more lenient
316
+ # Fall through to next method
226
317
  end
318
+
319
+ # For relative URLs, we need to make our best guess
320
+ return "http://#{@host}#{url}" if url.start_with?('/')
321
+ return "http://#{@host}/#{url}" if @host
322
+
323
+ # Last resort, return the original
324
+ url
325
+ rescue URI::InvalidURIError, URI::BadURIError
326
+ url # Return original instead of nil to be more lenient
227
327
  end
228
328
 
229
329
  # Extract a snippet from the first long paragraph
@@ -19,8 +19,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
19
19
 
20
20
  module WebInspector
21
21
  class Page
22
- attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links,
23
- :domain_links, :domain_images, :images, :response, :status_code, :favicon
22
+ attr_reader :status_code
24
23
 
25
24
  DEFAULT_TIMEOUT = 30
26
25
  DEFAULT_RETRIES = 3
@@ -70,7 +69,8 @@ module WebInspector
70
69
  end
71
70
 
72
71
  # Delegate methods to inspector
73
- %i[title description body links images meta].each do |method|
72
+ %i[title description body links images meta javascripts stylesheets language structured_data microdata
73
+ tag_count].each do |method|
74
74
  define_method(method) do
75
75
  return nil unless success?
76
76
 
@@ -132,8 +132,99 @@ module WebInspector
132
132
  @inspector.domain_images(u, host)
133
133
  end
134
134
 
135
- # Get full JSON representation of the page
136
- #
135
+ # Get information about the page's security
136
+ # @return [Hash] Security information
137
+ def security_info
138
+ return @security_info if defined?(@security_info)
139
+
140
+ @security_info = {
141
+ secure: scheme == 'https',
142
+ hsts: response&.headers && response.headers['strict-transport-security'] ? true : false,
143
+ content_security_policy: response&.headers && response.headers['content-security-policy'] ? true : false
144
+ }
145
+
146
+ # Extract SSL/TLS info if available and using HTTPS
147
+ if scheme == 'https' && response&.env&.response_headers
148
+ @security_info[:ssl_version] = response.env[:ssl_version]
149
+ @security_info[:cipher_suite] = response.env[:cipher_suite]
150
+ end
151
+
152
+ @security_info
153
+ end
154
+
155
+ # Get the content type of the page
156
+ # @return [String, nil] Content type
157
+ def content_type
158
+ response&.headers && response.headers['content-type']
159
+ end
160
+
161
+ # Get the size of the page in bytes
162
+ # @return [Integer, nil] Size in bytes
163
+ def size
164
+ return @size if defined?(@size)
165
+
166
+ @size = if response&.headers && response.headers['content-length']
167
+ response.headers['content-length'].to_i
168
+ elsif response&.body
169
+ response.body.bytesize
170
+ end
171
+ end
172
+
173
+ # Get the load time of the page in seconds
174
+ # @return [Float, nil] Load time in seconds
175
+ attr_reader :load_time
176
+
177
+ # Get all JSON-LD structured data (alias for structured_data)
178
+ # @return [Array<Hash>] Structured data
179
+ def json_ld
180
+ structured_data
181
+ end
182
+
183
+ # Get a hash of all technologies detected on the page
184
+ # @return [Hash] Detected technologies
185
+ def technologies
186
+ techs = {}
187
+ js_files = javascripts || []
188
+ css_files = stylesheets || []
189
+ page_body = body || ''
190
+ page_meta = meta || {}
191
+ response_headers = response&.headers || {}
192
+
193
+ # Frameworks and Libraries
194
+ techs[:jquery] = true if js_files.any? { |js| js.include?('jquery') } || page_body.include?('jQuery')
195
+ techs[:react] = true if page_body.include?('data-reactroot') || js_files.any? { |js| js.include?('react') }
196
+ techs[:vue] = true if page_body.include?('data-v-app') || js_files.any? { |js| js.include?('vue') }
197
+ techs[:angular] = true if page_body.include?('ng-version') || js_files.any? { |js| js.include?('angular') }
198
+ techs[:bootstrap] = true if css_files.any? do |css|
199
+ css.include?('bootstrap')
200
+ end || page_body.include?('class="container"')
201
+ if response_headers['x-powered-by']&.include?('Rails') || response_headers.key?('x-rails-env')
202
+ techs[:rails] =
203
+ true
204
+ end
205
+ techs[:php] = true if response_headers['x-powered-by']&.include?('PHP')
206
+
207
+ # CMS
208
+ techs[:wordpress] = true if page_meta['generator']&.include?('WordPress') || page_body.include?('/wp-content/')
209
+ techs[:shopify] = true if page_body.include?('Shopify.shop')
210
+
211
+ # Analytics
212
+ techs[:google_analytics] = true if js_files.any? { |js| js.include?('google-analytics.com') }
213
+
214
+ # Server
215
+ server = response_headers['server']
216
+ if server
217
+ techs[:server] = server
218
+ techs[:nginx] = true if server.include?('nginx')
219
+ techs[:apache] = true if server.include?('Apache')
220
+ techs[:iis] = true if server.include?('IIS')
221
+ techs[:express] = true if response_headers['x-powered-by']&.include?('Express')
222
+ end
223
+
224
+ techs
225
+ end
226
+
227
+ # Get full JSON representation of the page with all new data
137
228
  # @return [Hash] JSON representation of the page
138
229
  def to_hash
139
230
  {
@@ -146,7 +237,18 @@ module WebInspector
146
237
  'meta' => meta,
147
238
  'links' => links,
148
239
  'images' => images,
240
+ 'javascripts' => javascripts,
241
+ 'stylesheets' => stylesheets,
149
242
  'favicon' => favicon,
243
+ 'language' => language,
244
+ 'structured_data' => structured_data,
245
+ 'microdata' => microdata,
246
+ 'security_info' => security_info,
247
+ 'content_type' => content_type,
248
+ 'size' => size,
249
+ 'load_time' => load_time,
250
+ 'technologies' => technologies,
251
+ 'tag_count' => tag_count,
150
252
  'response' => {
151
253
  'status' => status_code,
152
254
  'headers' => response&.headers || {},
@@ -166,6 +268,8 @@ module WebInspector
166
268
  private
167
269
 
168
270
  def fetch
271
+ start_time = Time.now
272
+
169
273
  session = Faraday.new(url: url) do |faraday|
170
274
  # Configure retries based on available middleware
171
275
  faraday.request :retry, { max: @retries } if defined?(Faraday::Retry)
@@ -194,6 +298,7 @@ module WebInspector
194
298
  end
195
299
 
196
300
  @url = response.env.url.to_s
301
+ @load_time = Time.now - start_time
197
302
  response
198
303
  rescue Faraday::TimeoutError, Faraday::ConnectionFailed => e
199
304
  retries += 1
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module WebInspector
4
- VERSION = '1.0.0'
4
+ VERSION = '1.1.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webinspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Santangelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-03-18 00:00:00.000000000 Z
11
+ date: 2025-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake