canon 0.1.2 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/.rubocop.yml +9 -1
 - data/.rubocop_todo.yml +280 -5
 - data/README.adoc +203 -138
 - data/_config.yml +116 -0
 - data/docs/ADVANCED_TOPICS.adoc +20 -0
 - data/docs/BASIC_USAGE.adoc +16 -0
 - data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
 - data/docs/CLI.adoc +493 -0
 - data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
 - data/docs/DIFF_ARCHITECTURE.adoc +435 -0
 - data/docs/DIFF_FORMATTING.adoc +540 -0
 - data/docs/FORMATS.adoc +447 -0
 - data/docs/INDEX.adoc +222 -0
 - data/docs/INPUT_VALIDATION.adoc +477 -0
 - data/docs/MATCH_ARCHITECTURE.adoc +463 -0
 - data/docs/MATCH_OPTIONS.adoc +719 -0
 - data/docs/MODES.adoc +432 -0
 - data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
 - data/docs/OPTIONS.adoc +1387 -0
 - data/docs/PREPROCESSING.adoc +491 -0
 - data/docs/RSPEC.adoc +605 -0
 - data/docs/RUBY_API.adoc +478 -0
 - data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
 - data/docs/UNDERSTANDING_CANON.adoc +17 -0
 - data/docs/VERBOSE.adoc +482 -0
 - data/exe/canon +7 -0
 - data/lib/canon/cli.rb +179 -0
 - data/lib/canon/commands/diff_command.rb +195 -0
 - data/lib/canon/commands/format_command.rb +113 -0
 - data/lib/canon/comparison/base_comparator.rb +39 -0
 - data/lib/canon/comparison/comparison_result.rb +79 -0
 - data/lib/canon/comparison/html_comparator.rb +410 -0
 - data/lib/canon/comparison/json_comparator.rb +212 -0
 - data/lib/canon/comparison/match_options.rb +616 -0
 - data/lib/canon/comparison/xml_comparator.rb +566 -0
 - data/lib/canon/comparison/yaml_comparator.rb +93 -0
 - data/lib/canon/comparison.rb +239 -0
 - data/lib/canon/config.rb +172 -0
 - data/lib/canon/diff/diff_block.rb +71 -0
 - data/lib/canon/diff/diff_block_builder.rb +105 -0
 - data/lib/canon/diff/diff_classifier.rb +46 -0
 - data/lib/canon/diff/diff_context.rb +85 -0
 - data/lib/canon/diff/diff_context_builder.rb +107 -0
 - data/lib/canon/diff/diff_line.rb +77 -0
 - data/lib/canon/diff/diff_node.rb +56 -0
 - data/lib/canon/diff/diff_node_mapper.rb +148 -0
 - data/lib/canon/diff/diff_report.rb +133 -0
 - data/lib/canon/diff/diff_report_builder.rb +62 -0
 - data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
 - data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
 - data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
 - data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
 - data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
 - data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
 - data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
 - data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
 - data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
 - data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
 - data/lib/canon/diff_formatter/character_map.yml +197 -0
 - data/lib/canon/diff_formatter/debug_output.rb +431 -0
 - data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
 - data/lib/canon/diff_formatter/legend.rb +141 -0
 - data/lib/canon/diff_formatter.rb +520 -0
 - data/lib/canon/errors.rb +56 -0
 - data/lib/canon/formatters/html4_formatter.rb +17 -0
 - data/lib/canon/formatters/html5_formatter.rb +17 -0
 - data/lib/canon/formatters/html_formatter.rb +37 -0
 - data/lib/canon/formatters/html_formatter_base.rb +163 -0
 - data/lib/canon/formatters/json_formatter.rb +3 -0
 - data/lib/canon/formatters/xml_formatter.rb +20 -55
 - data/lib/canon/formatters/yaml_formatter.rb +4 -1
 - data/lib/canon/pretty_printer/html.rb +57 -0
 - data/lib/canon/pretty_printer/json.rb +25 -0
 - data/lib/canon/pretty_printer/xml.rb +29 -0
 - data/lib/canon/rspec_matchers.rb +222 -77
 - data/lib/canon/validators/base_validator.rb +49 -0
 - data/lib/canon/validators/html_validator.rb +138 -0
 - data/lib/canon/validators/json_validator.rb +89 -0
 - data/lib/canon/validators/xml_validator.rb +53 -0
 - data/lib/canon/validators/yaml_validator.rb +73 -0
 - data/lib/canon/version.rb +1 -1
 - data/lib/canon/xml/attribute_handler.rb +80 -0
 - data/lib/canon/xml/c14n.rb +36 -0
 - data/lib/canon/xml/character_encoder.rb +38 -0
 - data/lib/canon/xml/data_model.rb +225 -0
 - data/lib/canon/xml/element_matcher.rb +196 -0
 - data/lib/canon/xml/line_range_mapper.rb +158 -0
 - data/lib/canon/xml/namespace_handler.rb +86 -0
 - data/lib/canon/xml/node.rb +32 -0
 - data/lib/canon/xml/nodes/attribute_node.rb +54 -0
 - data/lib/canon/xml/nodes/comment_node.rb +23 -0
 - data/lib/canon/xml/nodes/element_node.rb +56 -0
 - data/lib/canon/xml/nodes/namespace_node.rb +38 -0
 - data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
 - data/lib/canon/xml/nodes/root_node.rb +16 -0
 - data/lib/canon/xml/nodes/text_node.rb +23 -0
 - data/lib/canon/xml/processor.rb +151 -0
 - data/lib/canon/xml/whitespace_normalizer.rb +72 -0
 - data/lib/canon/xml/xml_base_handler.rb +188 -0
 - data/lib/canon.rb +14 -3
 - metadata +116 -21
 
| 
         @@ -0,0 +1,163 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require "nokogiri"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module Canon
              module Formatters
                # Base class for HTML formatters with shared canonicalization logic
                #
                # This abstract base class provides common HTML canonicalization logic
                # for both HTML4 and HTML5 formatters. It handles:
                # - Attribute sorting for consistency
                # - Whitespace normalization
                # - Block element spacing
                #
                # == Canonicalization Process
                #
                # 1. Parse HTML using format-specific parser (subclass responsibility)
                # 2. Sort all element attributes alphabetically
                # 3. Normalize whitespace: whitespace-only text nodes next to (or
                #    directly inside) a block element become a single space, other
                #    whitespace-only text nodes are removed, and runs of whitespace
                #    inside remaining text are collapsed to one space
                # 4. Serialize to an HTML string
                # 5. Ensure a space separates a closing block tag from the opening
                #    block tag that immediately follows it
                #
                # == Subclass Implementation
                #
                # Subclasses must implement the `parse` class method:
                #
                #   def self.parse(html)
                #     # Return Nokogiri::HTML4::Document or Nokogiri::HTML5::Document
                #   end
                #
                # == Block Elements
                #
                # The following elements are treated as block-level and will have spacing
                # preserved between them: address, article, aside, blockquote, dd, details,
                # dialog, div, dl, dt, fieldset, figcaption, figure, footer, form, h1-h6,
                # header, hgroup, hr, li, main, nav, ol, p, pre, section, table, tbody,
                # td, tfoot, th, thead, tr, ul
                #
                # == Usage
                #
                #   # Via subclass (Html4Formatter or Html5Formatter)
                #   canonical_html = Canon::Formatters::Html4Formatter.format(html_string)
                #
                class HtmlFormatterBase
                  # Block-level HTML elements that should preserve spacing between them
                  BLOCK_ELEMENTS = %w[
                    address article aside blockquote dd details dialog div dl dt
                    fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6
                    header hgroup hr li main nav ol p pre section table tbody
                    td tfoot th thead tr ul
                  ].freeze

                  # Matches a closing block tag immediately followed by an opening
                  # block tag (e.g. "</div><p>" or "</li><li class=...").
                  # Compiled once here instead of on every serialization.
                  BLOCK_BOUNDARY_PATTERN = begin
                    tags = BLOCK_ELEMENTS.join("|")
                    %r{(</(?:#{tags})>)(<(?:#{tags})[\s>])}
                  end

                  # Format HTML using canonical form
                  # @param html [String] HTML document to canonicalize
                  # @return [String] Canonical form of HTML
                  # @raise [NotImplementedError] when called on a class that does not
                  #   override `parse`
                  def self.format(html)
                    canonicalize(parse(html))
                  end

                  # Parse HTML into a Nokogiri document
                  # @param _html [String] HTML document to parse
                  # @return [Nokogiri::HTML4::Document, Nokogiri::HTML5::Document]
                  #   Parsed HTML document
                  # @raise [NotImplementedError] always, in this base class
                  def self.parse(_html)
                    raise NotImplementedError,
                          "Subclasses must implement the parse method"
                  end

                  # Canonicalize HTML document
                  #
                  # The document is modified in place (attributes are reordered and
                  # text nodes rewritten or removed) before being serialized.
                  #
                  # @param doc [Nokogiri::HTML4::Document] Parsed HTML document
                  # @return [String] Canonical HTML string
                  def self.canonicalize(doc)
                    sort_attributes(doc)
                    normalize_whitespace(doc)

                    html = doc.to_html(
                      save_with: Nokogiri::XML::Node::SaveOptions::NO_DECLARATION,
                    ).strip

                    # The serialized string may have no whitespace left between
                    # adjacent block elements, so re-insert a single space there.
                    ensure_block_element_spacing(html)
                  end

                  # Sort element attributes alphabetically throughout document
                  #
                  # Every attribute is removed and then re-added in name order, so
                  # the serializer emits them sorted.
                  #
                  # @param doc [Nokogiri::HTML4::Document] Document to process
                  def self.sort_attributes(doc)
                    doc.traverse do |node|
                      next unless node.element?

                      # Snapshot once: the hash keeps the attribute objects (and
                      # their values) available after they are removed below.
                      attrs = node.attributes
                      next if attrs.empty?

                      attrs.each_key { |name| node.remove_attribute(name) }
                      attrs.sort_by { |name, _| name }.each do |name, attr|
                        node[name] = attr.value
                      end
                    end
                  end

                  # Normalize whitespace in every text node of the document
                  #
                  # Whitespace-only text nodes whose parent is an element are either
                  # reduced to one space or removed; all other text nodes have their
                  # whitespace runs collapsed.
                  #
                  # NOTE(review): text inside whitespace-sensitive elements such as
                  # <pre> is collapsed like any other text - confirm this is intended.
                  #
                  # @param doc [Nokogiri::HTML4::Document] Document to process
                  def self.normalize_whitespace(doc)
                    doc.traverse do |node|
                      next unless node.text?

                      if node.text.strip.empty? && node.parent&.element?
                        normalize_blank_text(node)
                      else
                        collapse_text(node)
                      end
                    end
                  end

                  # Reduce a whitespace-only text node to a single space when it is
                  # adjacent to, or a direct child of, a block element; otherwise
                  # remove it from the document.
                  # @param node [Nokogiri::XML::Node] Whitespace-only text node
                  def self.normalize_blank_text(node)
                    touches_block = block_element?(node.previous_sibling) ||
                      block_element?(node.next_sibling) ||
                      block_element?(node.parent)

                    if touches_block
                      node.content = " "
                    else
                      node.remove
                    end
                  end

                  # Collapse each run of whitespace in a text node to one space.
                  # Leading/trailing spaces are kept, except for the first or last
                  # child of <body>, where the text is stripped.
                  # @param node [Nokogiri::XML::Node] Text node to rewrite
                  def self.collapse_text(node)
                    normalized = node.text.gsub(/\s+/, " ")
                    at_body_edge = node.parent&.name == "body" &&
                      (node.previous_sibling.nil? || node.next_sibling.nil?)

                    normalized = normalized.strip if at_body_edge
                    node.content = normalized
                  end

                  # Ensure spacing between block element tags in serialized HTML
                  #
                  # Inserts one space between a closing block tag and an opening
                  # block tag that directly follows it.
                  #
                  # @param html [String] Serialized HTML string
                  # @return [String] HTML with proper spacing between block elements
                  def self.ensure_block_element_spacing(html)
                    html.gsub(BLOCK_BOUNDARY_PATTERN, '\1 \2')
                  end

                  # Check if a node is a block-level element
                  # @param node [Nokogiri::XML::Node, nil] Node to check
                  # @return [Boolean] true if node is a block element, false
                  #   otherwise (including when node is nil)
                  def self.block_element?(node)
                    return false unless node&.element?

                    BLOCK_ELEMENTS.include?(node.name.downcase)
                  end

                  private_class_method :sort_attributes, :normalize_whitespace,
                                       :normalize_blank_text, :collapse_text,
                                       :ensure_block_element_spacing, :block_element?
                end
              end
            end
         
     | 
| 
         @@ -1,6 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # frozen_string_literal: true
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require "json"
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative "../validators/json_validator"
         
     | 
| 
       4 
5 
     | 
    
         | 
| 
       5 
6 
     | 
    
         
             
            module Canon
         
     | 
| 
       6 
7 
     | 
    
         
             
              module Formatters
         
     | 
| 
         @@ -12,6 +13,8 @@ module Canon 
     | 
|
| 
       12 
13 
     | 
    
         
             
                  end
         
     | 
| 
       13 
14 
     | 
    
         | 
| 
       14 
15 
     | 
    
         
             
                  def self.parse(json)
         
     | 
| 
      
 16 
     | 
    
         
            +
                    # Validate before parsing
         
     | 
| 
      
 17 
     | 
    
         
            +
                    Canon::Validators::JsonValidator.validate!(json)
         
     | 
| 
       15 
18 
     | 
    
         
             
                    JSON.parse(json)
         
     | 
| 
       16 
19 
     | 
    
         
             
                  end
         
     | 
| 
       17 
20 
     | 
    
         | 
| 
         @@ -1,68 +1,33 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # frozen_string_literal: true
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require "nokogiri"
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative "../xml/c14n"
         
     | 
| 
      
 5 
     | 
    
         
            +
            require_relative "../pretty_printer/xml"
         
     | 
| 
      
 6 
     | 
    
         
            +
            require_relative "../validators/xml_validator"
         
     | 
| 
       4 
7 
     | 
    
         | 
| 
       5 
8 
     | 
    
         
             
            module Canon
         
     | 
| 
       6 
9 
     | 
    
         
             
              module Formatters
         
     | 
| 
       7 
     | 
    
         
            -
                # XML formatter  
     | 
| 
      
 10 
     | 
    
         
            +
                # XML formatter using Canonical XML 1.1 or pretty printing
         
     | 
| 
       8 
11 
     | 
    
         
             
                class XmlFormatter
         
     | 
| 
       9 
     | 
    
         
            -
                  #  
     | 
| 
       10 
     | 
    
         
            -
                  #  
     | 
| 
       11 
     | 
    
         
            -
                   
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
                     
     | 
| 
       18 
     | 
    
         
            -
                       
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
                      <xsl:template match="comment() | processing-instruction()">
         
     | 
| 
       21 
     | 
    
         
            -
                        <xsl:param name="indent" select="''"/>
         
     | 
| 
       22 
     | 
    
         
            -
                        <xsl:call-template name="newline"/>
         
     | 
| 
       23 
     | 
    
         
            -
                        <xsl:value-of select="$indent"/>
         
     | 
| 
       24 
     | 
    
         
            -
                        <xsl:copy />
         
     | 
| 
       25 
     | 
    
         
            -
                      </xsl:template>
         
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
                      <xsl:template match="text()">
         
     | 
| 
       28 
     | 
    
         
            -
                        <xsl:param name="indent" select="''"/>
         
     | 
| 
       29 
     | 
    
         
            -
                        <xsl:call-template name="newline"/>
         
     | 
| 
       30 
     | 
    
         
            -
                        <xsl:value-of select="$indent"/>
         
     | 
| 
       31 
     | 
    
         
            -
                        <xsl:value-of select="normalize-space(.)"/>
         
     | 
| 
       32 
     | 
    
         
            -
                      </xsl:template>
         
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
       34 
     | 
    
         
            -
                      <xsl:template match="text()[normalize-space(.)='']"/>
         
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
       36 
     | 
    
         
            -
                      <xsl:template match="*">
         
     | 
| 
       37 
     | 
    
         
            -
                        <xsl:param name="indent" select="''"/>
         
     | 
| 
       38 
     | 
    
         
            -
                        <xsl:call-template name="newline"/>
         
     | 
| 
       39 
     | 
    
         
            -
                        <xsl:value-of select="$indent"/>
         
     | 
| 
       40 
     | 
    
         
            -
                        <xsl:choose>
         
     | 
| 
       41 
     | 
    
         
            -
                          <xsl:when test="count(child::*) > 0">
         
     | 
| 
       42 
     | 
    
         
            -
                            <xsl:copy>
         
     | 
| 
       43 
     | 
    
         
            -
                            <xsl:copy-of select="@*"/>
         
     | 
| 
       44 
     | 
    
         
            -
                            <xsl:apply-templates select="*|text()">
         
     | 
| 
       45 
     | 
    
         
            -
                              <xsl:with-param name="indent" select="concat ($indent, $indent-increment)"/>
         
     | 
| 
       46 
     | 
    
         
            -
                            </xsl:apply-templates>
         
     | 
| 
       47 
     | 
    
         
            -
                            <xsl:call-template name="newline"/>
         
     | 
| 
       48 
     | 
    
         
            -
                            <xsl:value-of select="$indent"/>
         
     | 
| 
       49 
     | 
    
         
            -
                            </xsl:copy>
         
     | 
| 
       50 
     | 
    
         
            -
                          </xsl:when>
         
     | 
| 
       51 
     | 
    
         
            -
                          <xsl:otherwise>
         
     | 
| 
       52 
     | 
    
         
            -
                            <xsl:copy-of select="."/>
         
     | 
| 
       53 
     | 
    
         
            -
                          </xsl:otherwise>
         
     | 
| 
       54 
     | 
    
         
            -
                        </xsl:choose>
         
     | 
| 
       55 
     | 
    
         
            -
                      </xsl:template>
         
     | 
| 
       56 
     | 
    
         
            -
                    </xsl:stylesheet>
         
     | 
| 
       57 
     | 
    
         
            -
                  XSL
         
     | 
| 
       58 
     | 
    
         
            -
             
     | 
| 
       59 
     | 
    
         
            -
                  def self.format(xml)
         
     | 
| 
       60 
     | 
    
         
            -
                    Nokogiri::XSLT(NOKOGIRI_C14N_XSL)
         
     | 
| 
       61 
     | 
    
         
            -
                      .transform(Nokogiri::XML(xml, &:noblanks))
         
     | 
| 
       62 
     | 
    
         
            -
                      .to_xml(indent: 2, pretty: true, encoding: "UTF-8")
         
     | 
| 
      
 12 
     | 
    
         
            +
                  # Format XML with pretty printing by default
         
     | 
| 
      
 13 
     | 
    
         
            +
                  # @param xml [String] XML document to format
         
     | 
| 
      
 14 
     | 
    
         
            +
                  # @param pretty [Boolean] Whether to pretty print (default: true)
         
     | 
| 
      
 15 
     | 
    
         
            +
                  # @param indent [Integer] Number of spaces for indentation (default: 2)
         
     | 
| 
      
 16 
     | 
    
         
            +
                  # @return [String] Formatted XML
         
     | 
| 
      
 17 
     | 
    
         
            +
                  def self.format(xml, pretty: true, indent: 2)
         
     | 
| 
      
 18 
     | 
    
         
            +
                    if pretty
         
     | 
| 
      
 19 
     | 
    
         
            +
                      Canon::PrettyPrinter::Xml.new(indent: indent).format(xml)
         
     | 
| 
      
 20 
     | 
    
         
            +
                    else
         
     | 
| 
      
 21 
     | 
    
         
            +
                      Canon::Xml::C14n.canonicalize(xml, with_comments: false)
         
     | 
| 
      
 22 
     | 
    
         
            +
                    end
         
     | 
| 
       63 
23 
     | 
    
         
             
                  end
         
     | 
| 
       64 
24 
     | 
    
         | 
| 
      
 25 
     | 
    
         
            +
                  # Parse XML into a Nokogiri document
         
     | 
| 
      
 26 
     | 
    
         
            +
                  # @param xml [String] XML document to parse
         
     | 
| 
      
 27 
     | 
    
         
            +
                  # @return [Nokogiri::XML::Document] Parsed XML document
         
     | 
| 
       65 
28 
     | 
    
         
             
                  def self.parse(xml)
         
     | 
| 
      
 29 
     | 
    
         
            +
                    # Validate before parsing
         
     | 
| 
      
 30 
     | 
    
         
            +
                    Canon::Validators::XmlValidator.validate!(xml)
         
     | 
| 
       66 
31 
     | 
    
         
             
                    Nokogiri::XML(xml)
         
     | 
| 
       67 
32 
     | 
    
         
             
                  end
         
     | 
| 
       68 
33 
     | 
    
         
             
                end
         
     | 
| 
         @@ -1,6 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # frozen_string_literal: true
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require "yaml"
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative "../validators/yaml_validator"
         
     | 
| 
       4 
5 
     | 
    
         | 
| 
       5 
6 
     | 
    
         
             
            module Canon
         
     | 
| 
       6 
7 
     | 
    
         
             
              module Formatters
         
     | 
| 
         @@ -12,7 +13,9 @@ module Canon 
     | 
|
| 
       12 
13 
     | 
    
         
             
                  end
         
     | 
| 
       13 
14 
     | 
    
         | 
| 
       14 
15 
     | 
    
         
             
                  def self.parse(yaml)
         
     | 
| 
       15 
     | 
    
         
            -
                     
     | 
| 
      
 16 
     | 
    
         
            +
                    # Validate before parsing
         
     | 
| 
      
 17 
     | 
    
         
            +
                    Canon::Validators::YamlValidator.validate!(yaml)
         
     | 
| 
      
 18 
     | 
    
         
            +
                    YAML.safe_load(yaml, permitted_classes: [Symbol, Date, Time])
         
     | 
| 
       16 
19 
     | 
    
         
             
                  end
         
     | 
| 
       17 
20 
     | 
    
         | 
| 
       18 
21 
     | 
    
         
             
                  def self.sort_yaml_keys(obj)
         
     | 
| 
         @@ -0,0 +1,57 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require "nokogiri"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module Canon
         
     | 
| 
      
 6 
     | 
    
         
            +
              module PrettyPrinter
         
     | 
| 
      
 7 
     | 
    
         
            +
                # Pretty printer for HTML with consistent indentation
         
     | 
| 
      
 8 
     | 
    
         
            +
                class Html
         
     | 
| 
      
 9 
     | 
    
         
            +
                  def initialize(indent: 2, indent_type: "space")
         
     | 
| 
      
 10 
     | 
    
         
            +
                    @indent = indent.to_i
         
     | 
| 
      
 11 
     | 
    
         
            +
                    @indent_type = indent_type
         
     | 
| 
      
 12 
     | 
    
         
            +
                  end
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
                  # Pretty print HTML with consistent indentation
         
     | 
| 
      
 15 
     | 
    
         
            +
                  def format(html_string)
         
     | 
| 
      
 16 
     | 
    
         
            +
                    # Detect if this is XHTML or HTML
         
     | 
| 
      
 17 
     | 
    
         
            +
                    if xhtml?(html_string)
         
     | 
| 
      
 18 
     | 
    
         
            +
                      format_as_xhtml(html_string)
         
     | 
| 
      
 19 
     | 
    
         
            +
                    else
         
     | 
| 
      
 20 
     | 
    
         
            +
                      format_as_html(html_string)
         
     | 
| 
      
 21 
     | 
    
         
            +
                    end
         
     | 
| 
      
 22 
     | 
    
         
            +
                  end
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
                  private
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
                  def xhtml?(html_string)
         
     | 
| 
      
 27 
     | 
    
         
            +
                    # Check for XHTML DOCTYPE or xmlns attribute
         
     | 
| 
      
 28 
     | 
    
         
            +
                    html_string.include?("XHTML") ||
         
     | 
| 
      
 29 
     | 
    
         
            +
                      html_string.include?('xmlns="http://www.w3.org/1999/xhtml"')
         
     | 
| 
      
 30 
     | 
    
         
            +
                  end
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
                  def format_as_xhtml(html_string)
         
     | 
| 
      
 33 
     | 
    
         
            +
                    # Parse as XML for XHTML
         
     | 
| 
      
 34 
     | 
    
         
            +
                    doc = Nokogiri::XML(html_string, &:noblanks)
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
                    # Use Nokogiri's built-in pretty printing
         
     | 
| 
      
 37 
     | 
    
         
            +
                    if @indent_type == "tab"
         
     | 
| 
      
 38 
     | 
    
         
            +
                      doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
         
     | 
| 
      
 39 
     | 
    
         
            +
                    else
         
     | 
| 
      
 40 
     | 
    
         
            +
                      doc.to_xml(indent: @indent, encoding: "UTF-8")
         
     | 
| 
      
 41 
     | 
    
         
            +
                    end
         
     | 
| 
      
 42 
     | 
    
         
            +
                  end
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
                  def format_as_html(html_string)
         
     | 
| 
      
 45 
     | 
    
         
            +
                    # Parse as HTML5
         
     | 
| 
      
 46 
     | 
    
         
            +
                    doc = Nokogiri::HTML5(html_string)
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
                    # Use Nokogiri's built-in pretty printing
         
     | 
| 
      
 49 
     | 
    
         
            +
                    if @indent_type == "tab"
         
     | 
| 
      
 50 
     | 
    
         
            +
                      doc.to_html(indent: 1, indent_text: "\t", encoding: "UTF-8")
         
     | 
| 
      
 51 
     | 
    
         
            +
                    else
         
     | 
| 
      
 52 
     | 
    
         
            +
                      doc.to_html(indent: @indent, encoding: "UTF-8")
         
     | 
| 
      
 53 
     | 
    
         
            +
                    end
         
     | 
| 
      
 54 
     | 
    
         
            +
                  end
         
     | 
| 
      
 55 
     | 
    
         
            +
                end
         
     | 
| 
      
 56 
     | 
    
         
            +
              end
         
     | 
| 
      
 57 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,25 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require "json"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module Canon
         
     | 
| 
      
 6 
     | 
    
         
            +
              module PrettyPrinter
         
     | 
| 
      
 7 
     | 
    
         
            +
                # Pretty printer for JSON with consistent indentation
         
     | 
| 
      
 8 
     | 
    
         
            +
                class Json
         
     | 
| 
      
 9 
     | 
    
         
            +
                  def initialize(indent: 2, indent_type: "space")
         
     | 
| 
      
 10 
     | 
    
         
            +
                    @indent = indent.to_i
         
     | 
| 
      
 11 
     | 
    
         
            +
                    @indent_type = indent_type
         
     | 
| 
      
 12 
     | 
    
         
            +
                  end
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
                  # Pretty print JSON with consistent indentation
         
     | 
| 
      
 15 
     | 
    
         
            +
                  def format(json_string)
         
     | 
| 
      
 16 
     | 
    
         
            +
                    obj = JSON.parse(json_string)
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
                    # Determine indent string
         
     | 
| 
      
 19 
     | 
    
         
            +
                    indent_str = @indent_type == "tab" ? "\t" : " " * @indent
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
                    JSON.pretty_generate(obj, indent: indent_str)
         
     | 
| 
      
 22 
     | 
    
         
            +
                  end
         
     | 
| 
      
 23 
     | 
    
         
            +
                end
         
     | 
| 
      
 24 
     | 
    
         
            +
              end
         
     | 
| 
      
 25 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,29 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require "nokogiri"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module Canon
         
     | 
| 
      
 6 
     | 
    
         
            +
              module PrettyPrinter
         
     | 
| 
      
 7 
     | 
    
         
            +
                # Pretty printer for XML with consistent indentation
         
     | 
| 
      
 8 
     | 
    
         
            +
                class Xml
         
     | 
| 
      
 9 
     | 
    
         
            +
                  def initialize(indent: 2, indent_type: "space")
         
     | 
| 
      
 10 
     | 
    
         
            +
                    @indent = indent.to_i
         
     | 
| 
      
 11 
     | 
    
         
            +
                    @indent_type = indent_type
         
     | 
| 
      
 12 
     | 
    
         
            +
                  end
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
                  # Pretty print XML with consistent indentation
         
     | 
| 
      
 15 
     | 
    
         
            +
                  def format(xml_string)
         
     | 
| 
      
 16 
     | 
    
         
            +
                    doc = Nokogiri::XML(xml_string, &:noblanks)
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
                    # Use Nokogiri's built-in pretty printing
         
     | 
| 
      
 19 
     | 
    
         
            +
                    if @indent_type == "tab"
         
     | 
| 
      
 20 
     | 
    
         
            +
                      # For tabs, use indent_text parameter
         
     | 
| 
      
 21 
     | 
    
         
            +
                      doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
         
     | 
| 
      
 22 
     | 
    
         
            +
                    else
         
     | 
| 
      
 23 
     | 
    
         
            +
                      # For spaces, use indent parameter
         
     | 
| 
      
 24 
     | 
    
         
            +
                      doc.to_xml(indent: @indent, encoding: "UTF-8")
         
     | 
| 
      
 25 
     | 
    
         
            +
                    end
         
     | 
| 
      
 26 
     | 
    
         
            +
                  end
         
     | 
| 
      
 27 
     | 
    
         
            +
                end
         
     | 
| 
      
 28 
     | 
    
         
            +
              end
         
     | 
| 
      
 29 
     | 
    
         
            +
            end
         
     |