RubyGems - canon - Versions diffs - 0.1.3 → 0.1.5 - Mend

canon 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

checksums.yaml +4 -4
data/.rubocop.yml +9 -1
data/.rubocop_todo.yml +276 -7
data/README.adoc +203 -138
data/_config.yml +116 -0
data/docs/ADVANCED_TOPICS.adoc +20 -0
data/docs/BASIC_USAGE.adoc +16 -0
data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
data/docs/CLI.adoc +493 -0
data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
data/docs/DIFF_ARCHITECTURE.adoc +435 -0
data/docs/DIFF_FORMATTING.adoc +540 -0
data/docs/FORMATS.adoc +447 -0
data/docs/INDEX.adoc +222 -0
data/docs/INPUT_VALIDATION.adoc +477 -0
data/docs/MATCH_ARCHITECTURE.adoc +463 -0
data/docs/MATCH_OPTIONS.adoc +719 -0
data/docs/MODES.adoc +432 -0
data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
data/docs/OPTIONS.adoc +1387 -0
data/docs/PREPROCESSING.adoc +491 -0
data/docs/RSPEC.adoc +605 -0
data/docs/RUBY_API.adoc +478 -0
data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
data/docs/UNDERSTANDING_CANON.adoc +17 -0
data/docs/VERBOSE.adoc +482 -0
data/exe/canon +7 -0
data/lib/canon/cli.rb +179 -0
data/lib/canon/commands/diff_command.rb +195 -0
data/lib/canon/commands/format_command.rb +113 -0
data/lib/canon/comparison/base_comparator.rb +39 -0
data/lib/canon/comparison/comparison_result.rb +79 -0
data/lib/canon/comparison/html_comparator.rb +410 -0
data/lib/canon/comparison/json_comparator.rb +212 -0
data/lib/canon/comparison/match_options.rb +616 -0
data/lib/canon/comparison/xml_comparator.rb +566 -0
data/lib/canon/comparison/yaml_comparator.rb +93 -0
data/lib/canon/comparison.rb +239 -0
data/lib/canon/config.rb +172 -0
data/lib/canon/diff/diff_block.rb +71 -0
data/lib/canon/diff/diff_block_builder.rb +105 -0
data/lib/canon/diff/diff_classifier.rb +46 -0
data/lib/canon/diff/diff_context.rb +85 -0
data/lib/canon/diff/diff_context_builder.rb +107 -0
data/lib/canon/diff/diff_line.rb +77 -0
data/lib/canon/diff/diff_node.rb +56 -0
data/lib/canon/diff/diff_node_mapper.rb +148 -0
data/lib/canon/diff/diff_report.rb +133 -0
data/lib/canon/diff/diff_report_builder.rb +62 -0
data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
data/lib/canon/diff_formatter/character_map.yml +197 -0
data/lib/canon/diff_formatter/debug_output.rb +431 -0
data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
data/lib/canon/diff_formatter/legend.rb +141 -0
data/lib/canon/diff_formatter.rb +520 -0
data/lib/canon/errors.rb +56 -0
data/lib/canon/formatters/html4_formatter.rb +17 -0
data/lib/canon/formatters/html5_formatter.rb +17 -0
data/lib/canon/formatters/html_formatter.rb +37 -0
data/lib/canon/formatters/html_formatter_base.rb +163 -0
data/lib/canon/formatters/json_formatter.rb +3 -0
data/lib/canon/formatters/xml_formatter.rb +20 -55
data/lib/canon/formatters/yaml_formatter.rb +4 -1
data/lib/canon/pretty_printer/html.rb +57 -0
data/lib/canon/pretty_printer/json.rb +25 -0
data/lib/canon/pretty_printer/xml.rb +29 -0
data/lib/canon/rspec_matchers.rb +222 -80
data/lib/canon/validators/base_validator.rb +49 -0
data/lib/canon/validators/html_validator.rb +138 -0
data/lib/canon/validators/json_validator.rb +89 -0
data/lib/canon/validators/xml_validator.rb +53 -0
data/lib/canon/validators/yaml_validator.rb +73 -0
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/attribute_handler.rb +80 -0
data/lib/canon/xml/c14n.rb +36 -0
data/lib/canon/xml/character_encoder.rb +38 -0
data/lib/canon/xml/data_model.rb +225 -0
data/lib/canon/xml/element_matcher.rb +196 -0
data/lib/canon/xml/line_range_mapper.rb +158 -0
data/lib/canon/xml/namespace_handler.rb +86 -0
data/lib/canon/xml/node.rb +32 -0
data/lib/canon/xml/nodes/attribute_node.rb +54 -0
data/lib/canon/xml/nodes/comment_node.rb +23 -0
data/lib/canon/xml/nodes/element_node.rb +56 -0
data/lib/canon/xml/nodes/namespace_node.rb +38 -0
data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
data/lib/canon/xml/nodes/root_node.rb +16 -0
data/lib/canon/xml/nodes/text_node.rb +23 -0
data/lib/canon/xml/processor.rb +151 -0
data/lib/canon/xml/whitespace_normalizer.rb +72 -0
data/lib/canon/xml/xml_base_handler.rb +188 -0
data/lib/canon.rb +14 -3
metadata +116 -21

data/lib/canon/formatters/html_formatter_base.rb ADDED Viewed

@@ -0,0 +1,163 @@
+# frozen_string_literal: true
+require "nokogiri"
+module Canon
+  module Formatters
+    # Base class for HTML formatters with shared canonicalization logic
+    #
+    # This abstract base class provides common HTML canonicalization logic
+    # for both HTML4 and HTML5 formatters. It handles:
+    # - Attribute sorting for consistency
+    # - Whitespace normalization
+    # - Block element spacing
+    #
+    # == Canonicalization Process
+    #
+    # 1. Parse HTML using format-specific parser (subclass responsibility)
+    # 2. Sort all element attributes alphabetically
+    # 3. Normalize whitespace (remove whitespace-only text nodes, collapse runs)
+    # 4. Ensure proper spacing between block-level elements
+    # 5. Serialize to HTML string
+    #
+    # == Subclass Implementation
+    #
+    # Subclasses must implement the `parse` class method:
+    #
+    #   def self.parse(html)
+    #     # Return Nokogiri::HTML4::Document or Nokogiri::HTML5::Document
+    #   end
+    #
+    # == Block Elements
+    #
+    # The following elements are treated as block-level and will have spacing
+    # preserved between them: address, article, aside, blockquote, dd, details,
+    # dialog, div, dl, dt, fieldset, figcaption, figure, footer, form, h1-h6,
+    # header, hgroup, hr, li, main, nav, ol, p, pre, section, table, tbody,
+    # td, tfoot, th, thead, tr, ul
+    #
+    # == Usage
+    #
+    #   # Via subclass (Html4Formatter or Html5Formatter)
+    #   canonical_html = Canon::Formatters::Html4Formatter.format(html_string)
+    #
+    class HtmlFormatterBase
+      # Block-level HTML elements that should preserve spacing between them
+      BLOCK_ELEMENTS = %w[
+        address article aside blockquote dd details dialog div dl dt
+        fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6
+        header hgroup hr li main nav ol p pre section table tbody
+        td tfoot th thead tr ul
+      ].freeze
+      # Canonicalize an HTML string: parse it with the receiver's parse, then run canonicalize on the result
+      # @param html [String] HTML document to canonicalize
+      # @return [String] Canonical form of HTML (the value returned by canonicalize)
+      def self.format(html)
+        doc = parse(html) # the parse defined in this base class always raises NotImplementedError
+        canonicalize(doc)
+      end
+      # Abstract hook for parsing HTML into a Nokogiri document; subclasses must override it
+      # @param _html [String] HTML document to parse (unused by this base implementation)
+      # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document] Parsed HTML document, from subclass overrides only
+      # @raise [NotImplementedError] always, when this base implementation is the one invoked
+      def self.parse(_html)
+        raise NotImplementedError,
+              "Subclasses must implement the parse method"
+      end
+      # Canonicalize a parsed HTML document; the sorting and whitespace steps mutate doc in place
+      # @param doc [Nokogiri::HTML::Document] Parsed HTML document
+      # @return [String] Canonical HTML string
+      def self.canonicalize(doc)
+        # Re-add every element's attributes in name order
+        sort_attributes(doc)
+        # Remove or collapse whitespace held in text nodes
+        normalize_whitespace(doc)
+        # Serialize passing only the NO_DECLARATION save option, then strip the ends of the output
+        html = doc.to_html(
+          save_with: Nokogiri::XML::Node::SaveOptions::NO_DECLARATION,
+        ).strip
+        # Post-process the string: put one space between a closing block
+        # tag and a directly following opening block tag, because the
+        # whitespace text nodes between block elements may be gone by now
+        ensure_block_element_spacing(html)
+      end
+      # Sort element attributes alphabetically by name throughout document (mutates doc in place)
+      # @param doc [Nokogiri::HTML::Document] Document to process
+      def self.sort_attributes(doc)
+        doc.traverse do |node|
+          next unless node.element?
+          next if node.attributes.empty?
+          sorted_attrs = node.attributes.sort_by { |name, _| name } # snapshot of [name, attr] pairs in name order
+          node.attributes.each_key { |name| node.remove_attribute(name) } # clear all attributes first
+          sorted_attrs.each { |name, attr| node[name] = attr.value } # re-add them in sorted order with their old values
+        end
+      end
+      # Normalize whitespace in every text node (mutates doc in place): whitespace-only
+      # nodes are reduced to one space or removed, other text has whitespace runs collapsed
+      # @param doc [Nokogiri::HTML::Document] Document to process
+      def self.normalize_whitespace(doc)
+        # Visit every node and act only on text nodes
+        doc.traverse do |node|
+          next unless node.text?
+          # Whitespace-only text node whose parent is an element
+          if node.text.strip.empty? && node.parent&.element?
+            # Look at the neighbours to see whether a block-level element is adjacent
+            prev_sibling = node.previous_sibling
+            next_sibling = node.next_sibling
+            # Keep a single space when either sibling or the parent is a block element
+            if block_element?(prev_sibling) || block_element?(next_sibling) ||
+                block_element?(node.parent)
+              node.content = " "
+            else
+              # Otherwise drop the node from the tree
+              node.remove
+            end
+          else
+            # Every other text node: collapse each whitespace run into one space, so a single
+            # leading/trailing space survives (NOTE(review): text inside pre is collapsed too; confirm intended)
+            normalized = node.text.gsub(/\s+/, " ")
+            # Strip the ends only for text that is a direct child of body and
+            # is its first or last child (no previous or no next sibling)
+            if node.parent&.name == "body" &&
+                (node.previous_sibling.nil? || node.next_sibling.nil?)
+              normalized = normalized.strip
+            end
+            node.content = normalized
+          end
+        end
+      end
+      # Insert a space between adjacent block element tags in serialized HTML
+      # @param html [String] Serialized HTML string (not modified; gsub returns a new string)
+      # @return [String] HTML with one space between a closing block tag and the opening block tag right after it
+      def self.ensure_block_element_spacing(html)
+        # Alternation of all BLOCK_ELEMENTS names, interpolated into the regex below
+        block_tags = BLOCK_ELEMENTS.join("|")
+        # Match only </closing_block_tag> immediately followed by <opening_block_tag (then whitespace or >);
+        # the names come from BLOCK_ELEMENTS as written and the match is case-sensitive
+        html.gsub(/(<\/(?:#{block_tags})>)(<(?:#{block_tags})[\s>])/, '\1 \2')
+      end
+      # Check if a node is a block-level element
+      # @param node [Nokogiri::XML::Node, nil] Node to check
+      # @return [Boolean] true if node is a block element
+      def self.block_element?(node)
+        node&.element? && BLOCK_ELEMENTS.include?(node.name.downcase)
+      end
+      private_class_method :sort_attributes, :normalize_whitespace,
+                           :ensure_block_element_spacing, :block_element?
+    end
+  end
+end

data/lib/canon/formatters/json_formatter.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require "json"
+require_relative "../validators/json_validator"
 module Canon
   module Formatters
@@ -12,6 +13,8 @@ module Canon
       end
       def self.parse(json)
+        # Run the JSON validator on the raw input first (presumably raises on invalid input; confirm in JsonValidator)
+        Canon::Validators::JsonValidator.validate!(json)
         JSON.parse(json) # the parsed result is the return value of this method
       end

data/lib/canon/formatters/xml_formatter.rb CHANGED Viewed

@@ -1,68 +1,33 @@
 # frozen_string_literal: true
 require "nokogiri"
+require_relative "../xml/c14n"
+require_relative "../pretty_printer/xml"
+require_relative "../validators/xml_validator"
 module Canon
   module Formatters
-    # XML formatter for canonicalization
+    # XML formatter using Canonical XML 1.1 or pretty printing
     class XmlFormatter
-      # Source of XSLT
-      # https://emmanueloga.wordpress.com/2009/09/29/pretty-printing-xhtml-with-nokogiri-and-xslt/
-      NOKOGIRI_C14N_XSL = <<~XSL
-        <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
-          <xsl:output method="xml" encoding="ISO-8859-1"/>
-          <xsl:param name="indent-increment" select="'   '"/>
-          <xsl:template name="newline">
-            <xsl:text disable-output-escaping="yes">
-        </xsl:text>
-          </xsl:template>
-          <xsl:template match="comment() | processing-instruction()">
-            <xsl:param name="indent" select="''"/>
-            <xsl:call-template name="newline"/>
-            <xsl:value-of select="$indent"/>
-            <xsl:copy />
-          </xsl:template>
-          <xsl:template match="text()">
-            <xsl:param name="indent" select="''"/>
-            <xsl:call-template name="newline"/>
-            <xsl:value-of select="$indent"/>
-            <xsl:value-of select="normalize-space(.)"/>
-          </xsl:template>
-          <xsl:template match="text()[normalize-space(.)='']"/>
-          <xsl:template match="*">
-            <xsl:param name="indent" select="''"/>
-            <xsl:call-template name="newline"/>
-            <xsl:value-of select="$indent"/>
-            <xsl:choose>
-              <xsl:when test="count(child::*) > 0">
-                <xsl:copy>
-                <xsl:copy-of select="@*"/>
-                <xsl:apply-templates select="*|text()">
-                  <xsl:with-param name="indent" select="concat ($indent, $indent-increment)"/>
-                </xsl:apply-templates>
-                <xsl:call-template name="newline"/>
-                <xsl:value-of select="$indent"/>
-                </xsl:copy>
-              </xsl:when>
-              <xsl:otherwise>
-                <xsl:copy-of select="."/>
-              </xsl:otherwise>
-            </xsl:choose>
-          </xsl:template>
-        </xsl:stylesheet>
-      XSL
-      def self.format(xml)
-        Nokogiri::XSLT(NOKOGIRI_C14N_XSL)
-          .transform(Nokogiri::XML(xml, &:noblanks))
-          .to_xml(indent: 2, pretty: true, encoding: "UTF-8")
+      # Format XML: pretty print by default, or canonicalize through Canon::Xml::C14n when pretty is false
+      # @param xml [String] XML document to format
+      # @param pretty [Boolean] true selects Canon::PrettyPrinter::Xml, false selects C14n with with_comments: false (default: true)
+      # @param indent [Integer] Indent value given to the pretty printer; unused when pretty is false (default: 2)
+      # @return [String] Formatted XML
+      def self.format(xml, pretty: true, indent: 2)
+        if pretty
+          Canon::PrettyPrinter::Xml.new(indent: indent).format(xml)
+        else
+          Canon::Xml::C14n.canonicalize(xml, with_comments: false)
+        end
       end
+      # Validate the input, then parse XML into a Nokogiri document
+      # @param xml [String] XML document to parse
+      # @return [Nokogiri::XML::Document] Parsed XML document
       def self.parse(xml)
+        # Run the XML validator on the raw input first (presumably raises on invalid input; confirm in XmlValidator)
+        Canon::Validators::XmlValidator.validate!(xml)
         Nokogiri::XML(xml)
       end
     end

data/lib/canon/formatters/yaml_formatter.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require "yaml"
+require_relative "../validators/yaml_validator"
 module Canon
   module Formatters
@@ -12,7 +13,9 @@ module Canon
       end
       def self.parse(yaml)
-        YAML.safe_load(yaml)
+        # Run the YAML validator on the raw input first (presumably raises on invalid input; confirm in YamlValidator)
+        Canon::Validators::YamlValidator.validate!(yaml)
+        YAML.safe_load(yaml, permitted_classes: [Symbol, Date, Time]) # NOTE(review): Date is referenced but the requires shown for this file are only yaml and the validator; confirm date is loaded
       end
       def self.sort_yaml_keys(obj)

data/lib/canon/pretty_printer/html.rb ADDED Viewed

@@ -0,0 +1,57 @@
+# frozen_string_literal: true
+require "nokogiri"
+module Canon
+  module PrettyPrinter
+    # Pretty printer for HTML: input that looks like XHTML is handled as XML, everything else as HTML5
+    class Html
+      def initialize(indent: 2, indent_type: "space")
+        @indent = indent.to_i # indent value handed to the serializer when not in tab mode
+        @indent_type = indent_type # only the value tab is checked for; anything else takes the space path
+      end
+      # Pretty print html_string and return the serialized string, choosing the path via xhtml?
+      def format(html_string)
+        # XHTML-looking input takes the XML path, the rest takes the HTML5 path
+        if xhtml?(html_string)
+          format_as_xhtml(html_string)
+        else
+          format_as_html(html_string)
+        end
+      end
+      private
+      def xhtml?(html_string)
+        # Substring heuristic on the raw input: the word XHTML anywhere, or the XHTML xmlns attribute (NOTE(review): body text mentioning XHTML also matches)
+        html_string.include?("XHTML") ||
+          html_string.include?('xmlns="http://www.w3.org/1999/xhtml"')
+      end
+      def format_as_xhtml(html_string)
+        # Parse as XML with the noblanks parse option switched on
+        doc = Nokogiri::XML(html_string, &:noblanks)
+        # Serialize as UTF-8 XML: a tab as indent text in tab mode, otherwise @indent as the indent value
+        if @indent_type == "tab"
+          doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
+        else
+          doc.to_xml(indent: @indent, encoding: "UTF-8")
+        end
+      end
+      def format_as_html(html_string)
+        # Parse with the Nokogiri HTML5 parser
+        doc = Nokogiri::HTML5(html_string)
+        # Serialize as UTF-8 HTML: a tab as indent text in tab mode, otherwise @indent as the indent value
+        if @indent_type == "tab"
+          doc.to_html(indent: 1, indent_text: "\t", encoding: "UTF-8")
+        else
+          doc.to_html(indent: @indent, encoding: "UTF-8")
+        end
+      end
+    end
+  end
+end

data/lib/canon/pretty_printer/json.rb ADDED Viewed

@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+require "json"
+module Canon
+  module PrettyPrinter
+    # Pretty printer for JSON: parses the input and regenerates it with the configured indent string
+    class Json
+      def initialize(indent: 2, indent_type: "space")
+        @indent = indent.to_i # number of spaces per level when not in tab mode
+        @indent_type = indent_type # only the value tab is checked for; anything else means spaces
+      end
+      # Pretty print json_string and return the regenerated JSON text; no validator is called here, JSON.parse does the parsing
+      def format(json_string)
+        obj = JSON.parse(json_string)
+        # One tab in tab mode, otherwise a run of @indent spaces
+        indent_str = @indent_type == "tab" ? "\t" : " " * @indent
+        JSON.pretty_generate(obj, indent: indent_str)
+      end
+    end
+  end
+end

data/lib/canon/pretty_printer/xml.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+require "nokogiri"
+module Canon
+  module PrettyPrinter
+    # Pretty printer for XML: parses with Nokogiri and re-serializes with the configured indentation
+    class Xml
+      def initialize(indent: 2, indent_type: "space")
+        @indent = indent.to_i # indent value handed to to_xml when not in tab mode
+        @indent_type = indent_type # only the value tab is checked for; anything else takes the space path
+      end
+      # Pretty print xml_string and return the serialized UTF-8 XML string
+      def format(xml_string)
+        doc = Nokogiri::XML(xml_string, &:noblanks) # parse with the noblanks option switched on
+        # Pick the serializer arguments from the indent type
+        if @indent_type == "tab"
+          # Tab mode: indent of 1 with a tab character as the indent text
+          doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
+        else
+          # Space mode: @indent is passed as the indent value
+          doc.to_xml(indent: @indent, encoding: "UTF-8")
+        end
+      end
+    end
+  end
+end