RubyGems - webinspector - Versions diffs - 1.0.0 → 1.1.0 - Mend

webinspector 1.0.0 → 1.1.0

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/README.md +52 -0
data/lib/web_inspector/inspector.rb +160 -60
data/lib/web_inspector/page.rb +110 -5
data/lib/web_inspector/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0413d3ff948ab6efff6a1cbe8a7844287149ad06f09353655e6cb208968f9481
-  data.tar.gz: 152b950595afb57adc522da24c6959f71d160ba903b3d01ce6ee5f6a8b4d81d2
+  metadata.gz: df0bf76a03246a803f338a903611f128ee8b6d09329f33a9745a27eeb4e9793b
+  data.tar.gz: adda6867a10d3dc5f7a9fd0ec414d7046e140fe016d945eb20098a84a5176642
 SHA512:
-  metadata.gz: c6230493b59a0d23585be729ec98706cfdbd6852e2de2d65db83d1638f85110369d41f2275b8e0aa09b58008d53924d036840ce63523d9004f19275999be90f8
-  data.tar.gz: dad518b0b04c1e341c14c29438ebcf84f4602bf394254a4c61382ad79ce53dae2ac92f138f331d5bf089d2220011f14b0cde0a79608a91177b2bda2ab1773a96
+  metadata.gz: 01ce7c5aab007a3c9ef300c61a990a6f00d00604c14f0a3fdec28fdfb620a1e50ad576055b892cfe4d59de66975236674ce1f81f66e2d60c0ad0e0a0c3f4a951
+  data.tar.gz: ca58cdda149cf3b0cc6dcb29017b8e080b70b4ed881c924acddfba98760246417bb0304bc4eb42242a328d4ade99408181cc3ac10016998cd2b23de8fe34a8bf

data/README.md CHANGED Viewed

@@ -86,6 +86,58 @@ page.domain_images('example.com') # returns only images hosted on example.com
 page.find(["ruby", "rails"]) # returns [{"ruby"=>3}, {"rails"=>1}]
 ```
+#### JavaScript and Stylesheets
+```ruby
+page.javascripts  # array of all JavaScript files (absolute URLs)
+page.stylesheets  # array of all CSS stylesheets (absolute URLs)
+```
+#### Language Detection
+```ruby
+page.language  # detected language code (e.g., "en", "es", "fr")
+```
+#### Structured Data
+```ruby
+page.structured_data  # array of JSON-LD structured data objects
+page.microdata        # array of microdata items
+page.json_ld          # alias for structured_data
+```
+#### Security Information
+```ruby
+page.security_info  # hash with security details: { secure: true, hsts: true, ... }
+```
+#### Performance Metrics
+```ruby
+page.load_time  # page load time in seconds
+page.size       # page size in bytes
+```
+#### Content Type
+```ruby
+page.content_type  # content type header (e.g., "text/html; charset=utf-8")
+```
+#### Technology Detection
+```ruby
+page.technologies  # hash of detected technologies: { jquery: true, react: true, ... }
+```
+#### HTML Tag Statistics
+```ruby
+page.tag_count  # hash with counts of each HTML tag: { "div" => 45, "p" => 12, ... }
+```
 ### Export all data to JSON
 ```ruby

data/lib/web_inspector/inspector.rb CHANGED Viewed

@@ -71,26 +71,7 @@ module WebInspector
     # @return [Array<String>] Filtered links
     def domain_links(user_domain, host = nil)
       @host ||= host
-      return [] if links.empty?
-      # Handle nil user_domain
-      user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
-      # Normalize domain for comparison
-      user_domain = user_domain.to_s.downcase.gsub(/\s+/, '')
-      user_domain = user_domain.sub(/^www\./, '') # Remove www prefix for comparison
-      links.select do |link|
-        uri = URI.parse(link.to_s)
-        next false unless uri.host # Skip URLs without hosts
-        uri_host = uri.host.to_s.downcase
-        uri_host = uri_host.sub(/^www\./, '') # Remove www prefix for comparison
-        uri_host.include?(user_domain)
-      rescue URI::InvalidURIError, NoMethodError
-        false
-      end
+      filter_by_domain(links, user_domain)
     end
     # Get all images from the page
@@ -122,28 +103,131 @@ module WebInspector
     # @return [Array<String>] Filtered images
     def domain_images(user_domain, host = nil)
       @host ||= host
+      filter_by_domain(images, user_domain)
+    end
-      return [] if images.empty?
+    # Get all JavaScript files used by the page
+    # @return [Array<String>] Array of JavaScript file URLs
+    def javascripts
+      @javascripts ||= begin
+        scripts = []
+        @page.css('script[src]').each do |script|
+          src = script[:src]
+          next unless src
-      # Handle nil user_domain
-      user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
+          # Clean and normalize URL
+          src = src.strip
-      # Normalize domain for comparison
-      user_domain = user_domain.to_s.downcase.gsub(/\s+/, '')
-      user_domain = user_domain.sub(/^www\./, '') # Remove www prefix for comparison
+          begin
+            absolute_url = make_absolute_url(src)
+            scripts << absolute_url if absolute_url
+          rescue URI::InvalidURIError, URI::BadURIError
+            # Skip invalid URLs
+          end
+        end
+        scripts.uniq.compact
+      end
+    end
-      images.select do |img|
-        uri = URI.parse(img.to_s)
-        next false unless uri.host # Skip URLs without hosts
+    # Get stylesheets used by the page
+    # @return [Array<String>] Array of CSS file URLs
+    def stylesheets
+      @stylesheets ||= begin
+        styles = []
+        @page.css('link[rel="stylesheet"]').each do |style|
+          href = style[:href]
+          next unless href
-        uri_host = uri.host.to_s.downcase
-        uri_host = uri_host.sub(/^www\./, '') # Remove www prefix for comparison
-        uri_host.include?(user_domain)
-      rescue URI::InvalidURIError, NoMethodError
-        false
+          # Clean and normalize URL
+          href = href.strip
+          begin
+            absolute_url = make_absolute_url(href)
+            styles << absolute_url if absolute_url
+          rescue URI::InvalidURIError, URI::BadURIError
+            # Skip invalid URLs
+          end
+        end
+        styles.uniq.compact
+      end
+    end
+    # Detect the page language
+    # @return [String, nil] Language code if detected, nil otherwise
+    def language
+      # Check for html lang attribute first
+      html_tag = @page.at('html')
+      return html_tag['lang'] if html_tag && html_tag['lang'] && !html_tag['lang'].empty?
+      # Then check for language meta tag
+      lang_meta = @meta['content-language']
+      return lang_meta if lang_meta && !lang_meta.empty?
+      # Fallback to inspecting content headers if available
+      nil
+    end
+    # Extract structured data (JSON-LD) from the page
+    # @return [Array<Hash>] Array of structured data objects
+    def structured_data
+      @structured_data ||= begin
+        data = []
+        @page.css('script[type="application/ld+json"]').each do |script|
+          parsed = JSON.parse(script.text)
+          data << parsed if parsed
+        rescue JSON::ParserError
+          # Skip invalid JSON
+        end
+        data
+      end
+    end
+    # Extract microdata from the page
+    # @return [Array<Hash>] Array of microdata items
+    def microdata
+      @microdata ||= begin
+        items = []
+        @page.css('[itemscope]').each do |scope|
+          item = { type: scope['itemtype'] }
+          properties = {}
+          scope.css('[itemprop]').each do |prop|
+            name = prop['itemprop']
+            # Extract value based on tag
+            value = case prop.name.downcase
+                    when 'meta'
+                      prop['content']
+                    when 'img', 'audio', 'embed', 'iframe', 'source', 'track', 'video'
+                      make_absolute_url(prop['src'])
+                    when 'a', 'area', 'link'
+                      make_absolute_url(prop['href'])
+                    when 'time'
+                      prop['datetime'] || prop.text.strip
+                    else
+                      prop.text.strip
+                    end
+            properties[name] = value
+          end
+          item[:properties] = properties
+          items << item
+        end
+        items
       end
     end
+    # Count all tag types on the page
+    # @return [Hash] Counts of different HTML elements
+    def tag_count
+      tags = {}
+      @page.css('*').each do |element|
+        tag_name = element.name.downcase
+        tags[tag_name] ||= 0
+        tags[tag_name] += 1
+      end
+      tags
+    end
     private
     # Count occurrences of words in text
@@ -152,7 +236,7 @@ module WebInspector
     # @return [Array<Hash>] Count results
     def counter(text, words)
       words.map do |word|
-        { word => text.scan(/#{word.downcase}/).size }
+        { word => text.scan(/#{Regexp.escape(word.downcase)}/).size }
       end
     end
@@ -179,6 +263,30 @@ module WebInspector
       end
     end
+    # Filter a list of URLs by a given domain.
+    # @param collection [Array<String>] The list of URLs to filter.
+    # @param user_domain [String] The domain to filter by.
+    # @return [Array<String>] The filtered list of URLs.
+    def filter_by_domain(collection, user_domain)
+      return [] if collection.empty?
+      # Handle nil user_domain
+      user_domain = @host.to_s if user_domain.nil? || user_domain.empty?
+      # Normalize domain for comparison
+      normalized_domain = user_domain.to_s.downcase.gsub(/\s+/, '').sub(/^www\./, '')
+      collection.select do |item|
+        uri = URI.parse(item.to_s)
+        next false unless uri.host
+        uri_host = uri.host.to_s.downcase.sub(/^www\./, '')
+        uri_host.include?(normalized_domain)
+      rescue URI::InvalidURIError, NoMethodError
+        false
+      end
+    end
     # Make a URL absolute
     # @param url [String] URL to make absolute
     # @return [String, nil] Absolute URL or nil if invalid
@@ -191,39 +299,31 @@ module WebInspector
       # Get base URL from the page if not already set
       if @base_url.nil?
         base_tag = @page.at_css('base[href]')
-        @base_url = base_tag ? base_tag['href'] : nil
+        @base_url = base_tag ? base_tag['href'] : ''
       end
       begin
         # Try joining with base URL first if available
-        if @base_url && !@base_url.empty?
-          begin
-            return URI.join(@base_url, url).to_s
-          rescue URI::InvalidURIError, URI::BadURIError
-            # Fall through to next method
-          end
-        end
+        return URI.join(@base_url, url).to_s unless @base_url.empty?
+      rescue URI::InvalidURIError, URI::BadURIError
+        # Fall through to next method
+      end
+      begin
         # If we have @url, try to use it
-        if @url
-          begin
-            return URI.join(@url, url).to_s
-          rescue URI::InvalidURIError, URI::BadURIError
-            # Fall through to next method
-          end
-        end
-        # Otherwise use a default http:// base if url is absolute path
-        return "http://#{@host}#{url}" if url.start_with?('/')
-        # For truly relative URLs with no base, we need to make our best guess
-        return "http://#{@host}/#{url}" if @host
-        # Last resort, return the original
-        url
+        return URI.join(@url, url).to_s if @url
       rescue URI::InvalidURIError, URI::BadURIError
-        url # Return original instead of nil to be more lenient
+        # Fall through to next method
       end
+      # For relative URLs, we need to make our best guess
+      return "http://#{@host}#{url}" if url.start_with?('/')
+      return "http://#{@host}/#{url}" if @host
+      # Last resort, return the original
+      url
+    rescue URI::InvalidURIError, URI::BadURIError
+      url # Return original instead of nil to be more lenient
     end
     # Extract a snippet from the first long paragraph

data/lib/web_inspector/page.rb CHANGED Viewed

@@ -19,8 +19,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
 module WebInspector
   class Page
-    attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links,
-                :domain_links, :domain_images, :images, :response, :status_code, :favicon
+    attr_reader :status_code
     DEFAULT_TIMEOUT = 30
     DEFAULT_RETRIES = 3
@@ -70,7 +69,8 @@ module WebInspector
     end
     # Delegate methods to inspector
-    %i[title description body links images meta].each do |method|
+    %i[title description body links images meta javascripts stylesheets language structured_data microdata
+       tag_count].each do |method|
       define_method(method) do
         return nil unless success?
@@ -132,8 +132,99 @@ module WebInspector
       @inspector.domain_images(u, host)
     end
-    # Get full JSON representation of the page
-    #
+    # Get information about the page's security
+    # @return [Hash] Security information
+    def security_info
+      return @security_info if defined?(@security_info)
+      @security_info = {
+        secure: scheme == 'https',
+        hsts: response&.headers && response.headers['strict-transport-security'] ? true : false,
+        content_security_policy: response&.headers && response.headers['content-security-policy'] ? true : false
+      }
+      # Extract SSL/TLS info if available and using HTTPS
+      if scheme == 'https' && response&.env&.response_headers
+        @security_info[:ssl_version] = response.env[:ssl_version]
+        @security_info[:cipher_suite] = response.env[:cipher_suite]
+      end
+      @security_info
+    end
+    # Get the content type of the page
+    # @return [String, nil] Content type
+    def content_type
+      response&.headers && response.headers['content-type']
+    end
+    # Get the size of the page in bytes
+    # @return [Integer, nil] Size in bytes
+    def size
+      return @size if defined?(@size)
+      @size = if response&.headers && response.headers['content-length']
+                response.headers['content-length'].to_i
+              elsif response&.body
+                response.body.bytesize
+              end
+    end
+    # Get the load time of the page in seconds
+    # @return [Float, nil] Load time in seconds
+    attr_reader :load_time
+    # Get all JSON-LD structured data as a hash
+    # @return [Array<Hash>] Structured data
+    def json_ld
+      structured_data
+    end
+    # Get a hash of all technologies detected on the page
+    # @return [Hash] Detected technologies
+    def technologies
+      techs = {}
+      js_files = javascripts || []
+      css_files = stylesheets || []
+      page_body = body || ''
+      page_meta = meta || {}
+      response_headers = response&.headers || {}
+      # Frameworks and Libraries
+      techs[:jquery] = true if js_files.any? { |js| js.include?('jquery') } || page_body.include?('jQuery')
+      techs[:react] = true if page_body.include?('data-reactroot') || js_files.any? { |js| js.include?('react') }
+      techs[:vue] = true if page_body.include?('data-v-app') || js_files.any? { |js| js.include?('vue') }
+      techs[:angular] = true if page_body.include?('ng-version') || js_files.any? { |js| js.include?('angular') }
+      techs[:bootstrap] = true if css_files.any? do |css|
+        css.include?('bootstrap')
+      end || page_body.include?('class="container"')
+      if response_headers['x-powered-by']&.include?('Rails') || response_headers.key?('x-rails-env')
+        techs[:rails] =
+          true
+      end
+      techs[:php] = true if response_headers['x-powered-by']&.include?('PHP')
+      # CMS
+      techs[:wordpress] = true if page_meta['generator']&.include?('WordPress') || page_body.include?('/wp-content/')
+      techs[:shopify] = true if page_body.include?('Shopify.shop')
+      # Analytics
+      techs[:google_analytics] = true if js_files.any? { |js| js.include?('google-analytics.com') }
+      # Server
+      server = response_headers['server']
+      if server
+        techs[:server] = server
+        techs[:nginx] = true if server.include?('nginx')
+        techs[:apache] = true if server.include?('Apache')
+        techs[:iis] = true if server.include?('IIS')
+        techs[:express] = true if response_headers['x-powered-by']&.include?('Express')
+      end
+      techs
+    end
+    # Get full JSON representation of the page with all new data
     # @return [Hash] JSON representation of the page
     def to_hash
       {
@@ -146,7 +237,18 @@ module WebInspector
         'meta' => meta,
         'links' => links,
         'images' => images,
+        'javascripts' => javascripts,
+        'stylesheets' => stylesheets,
         'favicon' => favicon,
+        'language' => language,
+        'structured_data' => structured_data,
+        'microdata' => microdata,
+        'security_info' => security_info,
+        'content_type' => content_type,
+        'size' => size,
+        'load_time' => load_time,
+        'technologies' => technologies,
+        'tag_count' => tag_count,
         'response' => {
           'status' => status_code,
           'headers' => response&.headers || {},
@@ -166,6 +268,8 @@ module WebInspector
     private
     def fetch
+      start_time = Time.now
       session = Faraday.new(url: url) do |faraday|
         # Configure retries based on available middleware
         faraday.request :retry, { max: @retries } if defined?(Faraday::Retry)
@@ -194,6 +298,7 @@ module WebInspector
         end
         @url = response.env.url.to_s
+        @load_time = Time.now - start_time
         response
       rescue Faraday::TimeoutError, Faraday::ConnectionFailed => e
         retries += 1

data/lib/web_inspector/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module WebInspector
-  VERSION = '1.0.0'
+  VERSION = '1.1.0'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: webinspector
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - Davide Santangelo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-03-18 00:00:00.000000000 Z
+date: 2025-07-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake