canon 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -1
  3. data/.rubocop_todo.yml +276 -7
  4. data/README.adoc +203 -138
  5. data/_config.yml +116 -0
  6. data/docs/ADVANCED_TOPICS.adoc +20 -0
  7. data/docs/BASIC_USAGE.adoc +16 -0
  8. data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
  9. data/docs/CLI.adoc +493 -0
  10. data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  11. data/docs/DIFF_ARCHITECTURE.adoc +435 -0
  12. data/docs/DIFF_FORMATTING.adoc +540 -0
  13. data/docs/FORMATS.adoc +447 -0
  14. data/docs/INDEX.adoc +222 -0
  15. data/docs/INPUT_VALIDATION.adoc +477 -0
  16. data/docs/MATCH_ARCHITECTURE.adoc +463 -0
  17. data/docs/MATCH_OPTIONS.adoc +719 -0
  18. data/docs/MODES.adoc +432 -0
  19. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  20. data/docs/OPTIONS.adoc +1387 -0
  21. data/docs/PREPROCESSING.adoc +491 -0
  22. data/docs/RSPEC.adoc +605 -0
  23. data/docs/RUBY_API.adoc +478 -0
  24. data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
  25. data/docs/UNDERSTANDING_CANON.adoc +17 -0
  26. data/docs/VERBOSE.adoc +482 -0
  27. data/exe/canon +7 -0
  28. data/lib/canon/cli.rb +179 -0
  29. data/lib/canon/commands/diff_command.rb +195 -0
  30. data/lib/canon/commands/format_command.rb +113 -0
  31. data/lib/canon/comparison/base_comparator.rb +39 -0
  32. data/lib/canon/comparison/comparison_result.rb +79 -0
  33. data/lib/canon/comparison/html_comparator.rb +410 -0
  34. data/lib/canon/comparison/json_comparator.rb +212 -0
  35. data/lib/canon/comparison/match_options.rb +616 -0
  36. data/lib/canon/comparison/xml_comparator.rb +566 -0
  37. data/lib/canon/comparison/yaml_comparator.rb +93 -0
  38. data/lib/canon/comparison.rb +239 -0
  39. data/lib/canon/config.rb +172 -0
  40. data/lib/canon/diff/diff_block.rb +71 -0
  41. data/lib/canon/diff/diff_block_builder.rb +105 -0
  42. data/lib/canon/diff/diff_classifier.rb +46 -0
  43. data/lib/canon/diff/diff_context.rb +85 -0
  44. data/lib/canon/diff/diff_context_builder.rb +107 -0
  45. data/lib/canon/diff/diff_line.rb +77 -0
  46. data/lib/canon/diff/diff_node.rb +56 -0
  47. data/lib/canon/diff/diff_node_mapper.rb +148 -0
  48. data/lib/canon/diff/diff_report.rb +133 -0
  49. data/lib/canon/diff/diff_report_builder.rb +62 -0
  50. data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
  51. data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
  52. data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
  53. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
  54. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
  55. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
  56. data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
  57. data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
  58. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
  59. data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
  60. data/lib/canon/diff_formatter/character_map.yml +197 -0
  61. data/lib/canon/diff_formatter/debug_output.rb +431 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
  63. data/lib/canon/diff_formatter/legend.rb +141 -0
  64. data/lib/canon/diff_formatter.rb +520 -0
  65. data/lib/canon/errors.rb +56 -0
  66. data/lib/canon/formatters/html4_formatter.rb +17 -0
  67. data/lib/canon/formatters/html5_formatter.rb +17 -0
  68. data/lib/canon/formatters/html_formatter.rb +37 -0
  69. data/lib/canon/formatters/html_formatter_base.rb +163 -0
  70. data/lib/canon/formatters/json_formatter.rb +3 -0
  71. data/lib/canon/formatters/xml_formatter.rb +20 -55
  72. data/lib/canon/formatters/yaml_formatter.rb +4 -1
  73. data/lib/canon/pretty_printer/html.rb +57 -0
  74. data/lib/canon/pretty_printer/json.rb +25 -0
  75. data/lib/canon/pretty_printer/xml.rb +29 -0
  76. data/lib/canon/rspec_matchers.rb +222 -80
  77. data/lib/canon/validators/base_validator.rb +49 -0
  78. data/lib/canon/validators/html_validator.rb +138 -0
  79. data/lib/canon/validators/json_validator.rb +89 -0
  80. data/lib/canon/validators/xml_validator.rb +53 -0
  81. data/lib/canon/validators/yaml_validator.rb +73 -0
  82. data/lib/canon/version.rb +1 -1
  83. data/lib/canon/xml/attribute_handler.rb +80 -0
  84. data/lib/canon/xml/c14n.rb +36 -0
  85. data/lib/canon/xml/character_encoder.rb +38 -0
  86. data/lib/canon/xml/data_model.rb +225 -0
  87. data/lib/canon/xml/element_matcher.rb +196 -0
  88. data/lib/canon/xml/line_range_mapper.rb +158 -0
  89. data/lib/canon/xml/namespace_handler.rb +86 -0
  90. data/lib/canon/xml/node.rb +32 -0
  91. data/lib/canon/xml/nodes/attribute_node.rb +54 -0
  92. data/lib/canon/xml/nodes/comment_node.rb +23 -0
  93. data/lib/canon/xml/nodes/element_node.rb +56 -0
  94. data/lib/canon/xml/nodes/namespace_node.rb +38 -0
  95. data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
  96. data/lib/canon/xml/nodes/root_node.rb +16 -0
  97. data/lib/canon/xml/nodes/text_node.rb +23 -0
  98. data/lib/canon/xml/processor.rb +151 -0
  99. data/lib/canon/xml/whitespace_normalizer.rb +72 -0
  100. data/lib/canon/xml/xml_base_handler.rb +188 -0
  101. data/lib/canon.rb +14 -3
  102. metadata +116 -21
@@ -0,0 +1,163 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ module Canon
6
+ module Formatters
7
# Base class for HTML formatters with shared canonicalization logic
#
# This abstract base class provides common HTML canonicalization logic
# for both HTML4 and HTML5 formatters. It handles:
# - Attribute sorting for consistency
# - Whitespace normalization
# - Block element spacing
#
# == Canonicalization Process
#
# 1. Parse HTML using format-specific parser (subclass responsibility)
# 2. Sort all element attributes alphabetically
# 3. Normalize whitespace: whitespace-only text nodes are reduced to a
#    single space when they sit next to (or directly inside) a block
#    element and are removed otherwise; runs of whitespace inside other
#    text nodes are collapsed to one space
# 4. Serialize to an HTML string
# 5. Insert a space between a closing block tag and a directly following
#    opening block tag in the serialized output
#
# == Subclass Implementation
#
# Subclasses must implement the `parse` class method:
#
#   def self.parse(html)
#     # Return Nokogiri::HTML4::Document or Nokogiri::HTML5::Document
#   end
#
# == Block Elements
#
# The following elements are treated as block-level and will have spacing
# preserved between them: address, article, aside, blockquote, dd, details,
# dialog, div, dl, dt, fieldset, figcaption, figure, footer, form, h1-h6,
# header, hgroup, hr, li, main, nav, ol, p, pre, section, table, tbody,
# td, tfoot, th, thead, tr, ul
#
# == Usage
#
#   # Via subclass (Html4Formatter or Html5Formatter)
#   canonical_html = Canon::Formatters::Html4Formatter.format(html_string)
#
class HtmlFormatterBase
  # Block-level HTML elements that should preserve spacing between them
  BLOCK_ELEMENTS = %w[
    address article aside blockquote dd details dialog div dl dt
    fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6
    header hgroup hr li main nav ol p pre section table tbody
    td tfoot th thead tr ul
  ].freeze

  # Alternation of all block tag names, used to build BLOCK_TAG_BOUNDARY
  BLOCK_TAG_ALTERNATION = BLOCK_ELEMENTS.join("|").freeze

  # Matches a closing block tag immediately followed by an opening block
  # tag, e.g. `</div><p>` or `</li><li class="x">`. Built once here so it
  # is not recompiled on every call.
  #   group 1: the closing tag
  #   group 2: the start of the opening tag, up to and including the
  #            whitespace or `>` that terminates the tag name
  BLOCK_TAG_BOUNDARY =
    %r{(</(?:#{BLOCK_TAG_ALTERNATION})>)(<(?:#{BLOCK_TAG_ALTERNATION})[\s>])}.freeze

  private_constant :BLOCK_TAG_ALTERNATION, :BLOCK_TAG_BOUNDARY

  # Format HTML using canonical form
  # @param html [String] HTML document to canonicalize
  # @return [String] Canonical form of HTML
  def self.format(html)
    canonicalize(parse(html))
  end

  # Parse HTML into a Nokogiri document. Abstract: this base
  # implementation always raises.
  # @param _html [String] HTML document to parse
  # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document]
  #   Parsed HTML document (in subclasses)
  # @raise [NotImplementedError] always, unless overridden by a subclass
  def self.parse(_html)
    raise NotImplementedError,
          "Subclasses must implement the parse method"
  end

  # Canonicalize HTML document. The document is modified in place
  # (attributes reordered, text nodes rewritten or removed) before it is
  # serialized.
  # @param doc [Nokogiri::HTML::Document] Parsed HTML document
  # @return [String] Canonical HTML string
  def self.canonicalize(doc)
    # Sort attributes for consistency
    sort_attributes(doc)

    # Normalize whitespace between elements
    normalize_whitespace(doc)

    # Serialize with consistent formatting
    html = doc.to_html(
      save_with: Nokogiri::XML::Node::SaveOptions::NO_DECLARATION,
    ).strip

    # Post-process: ensure spaces between block element tags.
    # This is needed because Nokogiri's serialization may remove
    # whitespace text nodes between block elements
    ensure_block_element_spacing(html)
  end

  # Sort element attributes alphabetically throughout document
  # @param doc [Nokogiri::HTML::Document] Document to process
  # @return [void]
  def self.sort_attributes(doc)
    doc.traverse do |node|
      next unless node.element?

      attrs = node.attributes
      next if attrs.empty?

      # Capture name/value pairs before detaching anything so that no
      # value is read from an attribute that has already been removed.
      pairs = attrs.map { |name, attr| [name, attr.value] }
                   .sort_by { |name, _| name }

      # Re-adding in sorted order is what fixes the serialization order.
      attrs.each_key { |name| node.remove_attribute(name) }
      pairs.each { |name, value| node[name] = value }
    end
  end

  # Normalize whitespace in every text node of the document.
  #
  # NOTE(review): whitespace runs are collapsed in all text nodes,
  # including text inside <pre>, where whitespace is normally
  # significant - confirm this is intended for canonical comparison.
  #
  # @param doc [Nokogiri::HTML::Document] Document to process
  # @return [void]
  def self.normalize_whitespace(doc)
    doc.traverse do |node|
      next unless node.text?

      if node.text.strip.empty? && node.parent&.element?
        # Whitespace-only node: keep exactly one space when either
        # neighbour or the parent is a block element, otherwise drop it.
        if block_element?(node.previous_sibling) ||
           block_element?(node.next_sibling) ||
           block_element?(node.parent)
          node.content = " "
        else
          node.remove
        end
      else
        # Collapse multiple whitespace characters into single spaces
        # but preserve leading/trailing single spaces for inline content
        normalized = node.text.gsub(/\s+/, " ")

        # Only the first or last direct child of <body> is stripped
        if node.parent&.name == "body" &&
           (node.previous_sibling.nil? || node.next_sibling.nil?)
          normalized = normalized.strip
        end
        node.content = normalized
      end
    end
  end

  # Ensure spacing between block element tags in serialized HTML.
  # Only the sequence "closing block tag directly followed by opening
  # block tag" is affected; a single space is inserted between the two.
  # @param html [String] Serialized HTML string
  # @return [String] HTML with proper spacing between block elements
  def self.ensure_block_element_spacing(html)
    html.gsub(BLOCK_TAG_BOUNDARY, '\1 \2')
  end

  # Check if a node is a block-level element
  # @param node [Nokogiri::XML::Node, nil] Node to check
  # @return [Boolean] true if node is a block element, false otherwise
  #   (including when node is nil)
  def self.block_element?(node)
    return false unless node&.element?

    BLOCK_ELEMENTS.include?(node.name.downcase)
  end

  private_class_method :sort_attributes, :normalize_whitespace,
                       :ensure_block_element_spacing, :block_element?
end
162
+ end
163
+ end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "json"
4
+ require_relative "../validators/json_validator"
4
5
 
5
6
  module Canon
6
7
  module Formatters
@@ -12,6 +13,8 @@ module Canon
12
13
  end
13
14
 
14
15
  def self.parse(json)
16
+ # Validate before parsing
17
+ Canon::Validators::JsonValidator.validate!(json)
15
18
  JSON.parse(json)
16
19
  end
17
20
 
@@ -1,68 +1,33 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "nokogiri"
4
+ require_relative "../xml/c14n"
5
+ require_relative "../pretty_printer/xml"
6
+ require_relative "../validators/xml_validator"
4
7
 
5
8
  module Canon
6
9
  module Formatters
7
- # XML formatter for canonicalization
10
+ # XML formatter using Canonical XML 1.1 or pretty printing
8
11
  class XmlFormatter
9
- # Source of XSLT
10
- # https://emmanueloga.wordpress.com/2009/09/29/pretty-printing-xhtml-with-nokogiri-and-xslt/
11
- NOKOGIRI_C14N_XSL = <<~XSL
12
- <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
13
- <xsl:output method="xml" encoding="ISO-8859-1"/>
14
- <xsl:param name="indent-increment" select="' '"/>
15
- <xsl:template name="newline">
16
- <xsl:text disable-output-escaping="yes">
17
- </xsl:text>
18
- </xsl:template>
19
-
20
- <xsl:template match="comment() | processing-instruction()">
21
- <xsl:param name="indent" select="''"/>
22
- <xsl:call-template name="newline"/>
23
- <xsl:value-of select="$indent"/>
24
- <xsl:copy />
25
- </xsl:template>
26
-
27
- <xsl:template match="text()">
28
- <xsl:param name="indent" select="''"/>
29
- <xsl:call-template name="newline"/>
30
- <xsl:value-of select="$indent"/>
31
- <xsl:value-of select="normalize-space(.)"/>
32
- </xsl:template>
33
-
34
- <xsl:template match="text()[normalize-space(.)='']"/>
35
-
36
- <xsl:template match="*">
37
- <xsl:param name="indent" select="''"/>
38
- <xsl:call-template name="newline"/>
39
- <xsl:value-of select="$indent"/>
40
- <xsl:choose>
41
- <xsl:when test="count(child::*) > 0">
42
- <xsl:copy>
43
- <xsl:copy-of select="@*"/>
44
- <xsl:apply-templates select="*|text()">
45
- <xsl:with-param name="indent" select="concat ($indent, $indent-increment)"/>
46
- </xsl:apply-templates>
47
- <xsl:call-template name="newline"/>
48
- <xsl:value-of select="$indent"/>
49
- </xsl:copy>
50
- </xsl:when>
51
- <xsl:otherwise>
52
- <xsl:copy-of select="."/>
53
- </xsl:otherwise>
54
- </xsl:choose>
55
- </xsl:template>
56
- </xsl:stylesheet>
57
- XSL
58
-
59
- def self.format(xml)
60
- Nokogiri::XSLT(NOKOGIRI_C14N_XSL)
61
- .transform(Nokogiri::XML(xml, &:noblanks))
62
- .to_xml(indent: 2, pretty: true, encoding: "UTF-8")
12
+ # Format XML with pretty printing by default
13
+ # @param xml [String] XML document to format
14
+ # @param pretty [Boolean] Whether to pretty print (default: true)
15
+ # @param indent [Integer] Number of spaces for indentation (default: 2)
16
+ # @return [String] Formatted XML
17
+ def self.format(xml, pretty: true, indent: 2)
18
+ if pretty
19
+ Canon::PrettyPrinter::Xml.new(indent: indent).format(xml)
20
+ else
21
+ Canon::Xml::C14n.canonicalize(xml, with_comments: false)
22
+ end
63
23
  end
64
24
 
25
+ # Parse XML into a Nokogiri document
26
+ # @param xml [String] XML document to parse
27
+ # @return [Nokogiri::XML::Document] Parsed XML document
65
28
  def self.parse(xml)
29
+ # Validate before parsing
30
+ Canon::Validators::XmlValidator.validate!(xml)
66
31
  Nokogiri::XML(xml)
67
32
  end
68
33
  end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "yaml"
4
+ require_relative "../validators/yaml_validator"
4
5
 
5
6
  module Canon
6
7
  module Formatters
@@ -12,7 +13,9 @@ module Canon
12
13
  end
13
14
 
14
15
  def self.parse(yaml)
15
- YAML.safe_load(yaml)
16
+ # Validate before parsing
17
+ Canon::Validators::YamlValidator.validate!(yaml)
18
+ YAML.safe_load(yaml, permitted_classes: [Symbol, Date, Time])
16
19
  end
17
20
 
18
21
  def self.sort_yaml_keys(obj)
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ module Canon
6
+ module PrettyPrinter
7
# Pretty printer for HTML with consistent indentation.
#
# Input that looks like XHTML is parsed with the XML parser and
# serialized as XML; everything else is parsed as HTML5 and serialized
# as HTML.
class Html
  # An XHTML document type declaration: "XHTML" must appear inside the
  # <!DOCTYPE ...> declaration itself, not merely somewhere in the text.
  XHTML_DOCTYPE = /(?i:<!DOCTYPE)[^>]*XHTML/.freeze

  # The XHTML namespace declared via an xmlns attribute (either quote
  # style, optional whitespace around "=").
  XHTML_NAMESPACE = %r{\bxmlns\s*=\s*["']http://www\.w3\.org/1999/xhtml["']}.freeze

  private_constant :XHTML_DOCTYPE, :XHTML_NAMESPACE

  # @param indent [Integer, #to_i] spaces per indentation level
  #   (not used when indenting with tabs)
  # @param indent_type [String, Symbol] "tab" to indent with one tab per
  #   level; any other value indents with spaces
  def initialize(indent: 2, indent_type: "space")
    @indent = indent.to_i
    @indent_type = indent_type
  end

  # Pretty print HTML with consistent indentation
  # @param html_string [String] HTML or XHTML source
  # @return [String] re-serialized, indented markup
  def format(html_string)
    if xhtml?(html_string)
      format_as_xhtml(html_string)
    else
      format_as_html(html_string)
    end
  end

  private

  # Detect XHTML by its DOCTYPE or by an xmlns attribute carrying the
  # XHTML namespace. A plain substring test for "XHTML" is deliberately
  # avoided: it would also fire on ordinary HTML whose text merely
  # mentions the word.
  def xhtml?(html_string)
    html_string.match?(XHTML_DOCTYPE) || html_string.match?(XHTML_NAMESPACE)
  end

  # Parse as XML (dropping blank nodes) and serialize as XML
  def format_as_xhtml(html_string)
    Nokogiri::XML(html_string, &:noblanks).to_xml(serialization_options)
  end

  # Parse as HTML5 and serialize as HTML
  def format_as_html(html_string)
    Nokogiri::HTML5(html_string).to_html(serialization_options)
  end

  # Options handed to Nokogiri's serializer: one tab per level for tab
  # indentation, otherwise @indent spaces per level.
  def serialization_options
    if @indent_type.to_s == "tab"
      { indent: 1, indent_text: "\t", encoding: "UTF-8" }
    else
      { indent: @indent, encoding: "UTF-8" }
    end
  end
end
56
+ end
57
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Canon
6
+ module PrettyPrinter
7
# Pretty printer for JSON with consistent indentation
class Json
  # @param indent [Integer, #to_i] spaces per indentation level
  #   (not used when indenting with tabs)
  # @param indent_type [String, Symbol] "tab" to indent with one tab per
  #   level; any other value indents with spaces
  def initialize(indent: 2, indent_type: "space")
    @indent = indent.to_i
    @indent_type = indent_type
  end

  # Pretty print JSON with consistent indentation
  # @param json_string [String] JSON source text
  # @return [String] the parsed document re-generated with indentation
  # @raise [JSON::ParserError] if json_string is not valid JSON
  def format(json_string)
    JSON.pretty_generate(JSON.parse(json_string), indent: indent_string)
  end

  private

  # The string emitted once per nesting level. The type is compared via
  # to_s so that both "tab" and :tab select tab indentation.
  def indent_string
    @indent_type.to_s == "tab" ? "\t" : " " * @indent
  end
end
24
+ end
25
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ module Canon
6
+ module PrettyPrinter
7
# Pretty printer for XML with consistent indentation
class Xml
  # @param indent [Integer, #to_i] spaces per indentation level
  #   (not used when indenting with tabs)
  # @param indent_type [String, Symbol] "tab" to indent with one tab per
  #   level; any other value indents with spaces
  def initialize(indent: 2, indent_type: "space")
    @indent = indent.to_i
    @indent_type = indent_type
  end

  # Pretty print XML with consistent indentation.
  # The input is parsed with blank nodes dropped and then re-serialized
  # by Nokogiri's built-in pretty printing.
  # @param xml_string [String] XML source text
  # @return [String] indented XML, UTF-8 encoded
  def format(xml_string)
    Nokogiri::XML(xml_string, &:noblanks).to_xml(serialization_options)
  end

  private

  # Options handed to Nokogiri's serializer: one tab per level for tab
  # indentation, otherwise @indent spaces per level. The type is compared
  # via to_s so that both "tab" and :tab select tab indentation.
  def serialization_options
    if @indent_type.to_s == "tab"
      { indent: 1, indent_text: "\t", encoding: "UTF-8" }
    else
      { indent: @indent, encoding: "UTF-8" }
    end
  end
end
28
+ end
29
+ end