canon 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +25 -135
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/advanced/extending-canon.adoc +193 -0
  6. data/docs/internals/diffnode-enrichment.adoc +611 -0
  7. data/docs/internals/index.adoc +251 -0
  8. data/docs/lychee.toml +13 -6
  9. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +250 -0
  10. data/docs/understanding/architecture.adoc +749 -33
  11. data/docs/understanding/comparison-pipeline.adoc +122 -0
  12. data/false_positive_analysis.txt +0 -0
  13. data/file1.html +1 -0
  14. data/file2.html +1 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +86 -0
  27. data/lib/canon/comparison/html_comparator.rb +51 -18
  28. data/lib/canon/comparison/html_parser.rb +80 -0
  29. data/lib/canon/comparison/json_comparator.rb +12 -0
  30. data/lib/canon/comparison/json_parser.rb +19 -0
  31. data/lib/canon/comparison/markup_comparator.rb +293 -0
  32. data/lib/canon/comparison/match_options/base_resolver.rb +143 -0
  33. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  34. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  35. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  36. data/lib/canon/comparison/match_options.rb +68 -463
  37. data/lib/canon/comparison/profile_definition.rb +149 -0
  38. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  39. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  40. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  41. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  42. data/lib/canon/comparison/xml_comparator/child_comparison.rb +189 -0
  43. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  44. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  45. data/lib/canon/comparison/xml_comparator/node_parser.rb +74 -0
  46. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +95 -0
  47. data/lib/canon/comparison/xml_comparator.rb +52 -664
  48. data/lib/canon/comparison/xml_node_comparison.rb +297 -0
  49. data/lib/canon/comparison/xml_parser.rb +19 -0
  50. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  51. data/lib/canon/comparison.rb +265 -110
  52. data/lib/canon/diff/diff_node.rb +32 -2
  53. data/lib/canon/diff/node_serializer.rb +191 -0
  54. data/lib/canon/diff/path_builder.rb +143 -0
  55. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  56. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  57. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  58. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  59. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  60. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  61. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  64. data/lib/canon/diff_formatter.rb +1 -1
  65. data/lib/canon/rspec_matchers.rb +1 -1
  66. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  67. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  68. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  69. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  70. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  71. data/lib/canon/version.rb +1 -1
  72. data/old-docs/ADVANCED_TOPICS.adoc +20 -0
  73. data/old-docs/BASIC_USAGE.adoc +16 -0
  74. data/old-docs/CHARACTER_VISUALIZATION.adoc +567 -0
  75. data/old-docs/CLI.adoc +497 -0
  76. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  77. data/old-docs/DIFF_ARCHITECTURE.adoc +435 -0
  78. data/old-docs/DIFF_FORMATTING.adoc +540 -0
  79. data/old-docs/DIFF_PARAMETERS.adoc +261 -0
  80. data/old-docs/DOM_DIFF.adoc +1017 -0
  81. data/old-docs/ENV_CONFIG.adoc +876 -0
  82. data/old-docs/FORMATS.adoc +867 -0
  83. data/old-docs/INPUT_VALIDATION.adoc +477 -0
  84. data/old-docs/MATCHER_BEHAVIOR.adoc +90 -0
  85. data/old-docs/MATCH_ARCHITECTURE.adoc +463 -0
  86. data/old-docs/MATCH_OPTIONS.adoc +912 -0
  87. data/old-docs/MODES.adoc +432 -0
  88. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  89. data/old-docs/OPTIONS.adoc +1387 -0
  90. data/old-docs/PREPROCESSING.adoc +491 -0
  91. data/old-docs/README.old.adoc +2831 -0
  92. data/old-docs/RSPEC.adoc +814 -0
  93. data/old-docs/RUBY_API.adoc +485 -0
  94. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +646 -0
  95. data/old-docs/SEMANTIC_TREE_DIFF.adoc +765 -0
  96. data/old-docs/STRING_COMPARE.adoc +345 -0
  97. data/old-docs/TMP.adoc +3384 -0
  98. data/old-docs/TREE_DIFF.adoc +1080 -0
  99. data/old-docs/UNDERSTANDING_CANON.adoc +17 -0
  100. data/old-docs/VERBOSE.adoc +482 -0
  101. data/old-docs/VISUALIZATION_MAP.adoc +625 -0
  102. data/old-docs/WHITESPACE_TREATMENT.adoc +1155 -0
  103. data/scripts/analyze_current_state.rb +85 -0
  104. data/scripts/analyze_false_positives.rb +114 -0
  105. data/scripts/analyze_remaining_failures.rb +105 -0
  106. data/scripts/compare_current_failures.rb +95 -0
  107. data/scripts/compare_dom_tree_diff.rb +158 -0
  108. data/scripts/compare_failures.rb +151 -0
  109. data/scripts/debug_attribute_extraction.rb +66 -0
  110. data/scripts/debug_blocks_839.rb +115 -0
  111. data/scripts/debug_meta_matching.rb +52 -0
  112. data/scripts/debug_p_matching.rb +192 -0
  113. data/scripts/debug_signature_matching.rb +118 -0
  114. data/scripts/debug_sourcecode_124.rb +32 -0
  115. data/scripts/debug_whitespace_sensitive.rb +192 -0
  116. data/scripts/extract_false_positives.rb +138 -0
  117. data/scripts/find_actual_false_positives.rb +125 -0
  118. data/scripts/investigate_all_false_positives.rb +161 -0
  119. data/scripts/investigate_batch1.rb +127 -0
  120. data/scripts/investigate_classification.rb +150 -0
  121. data/scripts/investigate_classification_detailed.rb +190 -0
  122. data/scripts/investigate_common_failures.rb +342 -0
  123. data/scripts/investigate_false_negative.rb +80 -0
  124. data/scripts/investigate_false_positive.rb +83 -0
  125. data/scripts/investigate_false_positives.rb +227 -0
  126. data/scripts/investigate_false_positives_batch.rb +163 -0
  127. data/scripts/investigate_mixed_content.rb +125 -0
  128. data/scripts/investigate_remaining_16.rb +214 -0
  129. data/scripts/run_single_test.rb +29 -0
  130. data/scripts/test_all_false_positives.rb +95 -0
  131. data/scripts/test_attribute_details.rb +61 -0
  132. data/scripts/test_both_algorithms.rb +49 -0
  133. data/scripts/test_both_simple.rb +49 -0
  134. data/scripts/test_enhanced_semantic_output.rb +125 -0
  135. data/scripts/test_readme_examples.rb +131 -0
  136. data/scripts/test_semantic_tree_diff.rb +99 -0
  137. data/scripts/test_semantic_ux_improvements.rb +135 -0
  138. data/scripts/test_single_false_positive.rb +119 -0
  139. data/scripts/test_size_limits.rb +99 -0
  140. data/test_html_1.html +21 -0
  141. data/test_html_2.html +21 -0
  142. data/test_nokogiri.rb +33 -0
  143. data/test_normalize.rb +45 -0
  144. metadata +123 -2
@@ -0,0 +1,251 @@
1
+ ---
2
+ title: Internals
3
+ parent: Advanced
4
+ nav_order: 7
5
+ has_children: true
6
+ ---
7
+ = Internals
8
+
9
+ == Purpose
10
+
11
+ This section contains detailed implementation documentation for Canon's internal systems. These documents explain how Canon works under the hood, covering data structures, algorithms, and architectural patterns.
12
+
13
+ == Audience
14
+
15
+ These documents are intended for:
16
+
17
+ * Canon contributors and maintainers
18
+ * Developers extending Canon with custom functionality
19
+ * Anyone debugging complex comparison issues
20
+ * Users wanting to understand implementation details
21
+
22
+ == Topics
23
+
24
+ link:diffnode-enrichment[**DiffNode Enrichment**]::
25
+ How DiffNode objects carry location, serialized content, and attribute metadata through the comparison pipeline. Covers PathBuilder (canonical paths with ordinal indices), NodeSerializer (library-agnostic serialization), and how Layer 2 algorithms populate metadata for Layer 4 rendering.
26
+
27
+ link:../advanced/dom-diff-internals[**DOM Diff Internals**]::
28
+ Deep dive into Canon's default DOM diff algorithm: position-based matching, operation detection, and diff generation.
29
+
30
+ link:../advanced/semantic-tree-diff-internals[**Semantic Tree Diff Internals**]::
31
+ How the experimental semantic tree diff works: signature calculation, similarity matching, and operation classification.
32
+
33
+ link:../advanced/verbose-mode-architecture[**Verbose Mode Architecture**]::
34
+ The two-tier diff output system: normative vs informative diffs, and how verbose mode enriches output.
35
+
36
+ link:../advanced/diff-classification[**Diff Classification System**]::
37
+ How Canon classifies differences as normative (structural) or informative (presentational).
38
+
39
+ link:../advanced/diff-pipeline[**Diff Pipeline Architecture**]::
40
+ The six-layer technical pipeline from input to formatted output.
41
+
42
+ link:../advanced/extending-canon[**Extending Canon**]::
43
+ How to create custom comparators, formatters, and match strategies.
44
+
45
+ == Core Concepts
46
+
47
+ === Library Agnosticism
48
+
49
+ Canon is designed to work with multiple parsing libraries (Nokogiri, Moxml, Canon::Xml::Node) without being tied to any specific implementation. This is achieved through:
50
+
51
+ * **Adapter pattern**: Format-specific adapters normalize different node types
52
+ * **Utility classes**: PathBuilder and NodeSerializer work with any library
53
+ * **Interface-based design**: Code depends on behavior (respond_to?) not concrete types
54
+
55
+ This allows Canon to:
56
+ * Support new parsing libraries without major refactoring
57
+ * Switch libraries for better performance or features
58
+ * Remain compatible as libraries evolve
59
+
60
+ === The 4-Layer Architecture
61
+
62
+ Canon separates comparison concerns into four layers:
63
+
64
+ * **Layer 1**: Preprocessing - Normalize documents before comparison
65
+ * **Layer 2**: Algorithm - Choose comparison strategy (DOM vs Semantic)
66
+ * **Layer 3**: Match Options - Configure what to compare
67
+ * **Layer 4**: Diff Formatting - Control output presentation
68
+
69
+ Only Layer 2 differs between algorithms, and the enriched DiffNode structure ensures clean communication between layers.
70
+
71
+ See link:../understanding/architecture.adoc[Architecture] for the complete overview.
72
+
73
+ === Enriched Metadata Flow
74
+
75
+ [mermaid]
76
+ ----
77
+ graph LR
78
+ A[Layer 2: Algorithm] -->|Creates DiffNode| B[PathBuilder]
79
+ A -->|Creates DiffNode| C[NodeSerializer]
80
+ B -->|Enriches| D[DiffNode.path]
81
+ C -->|Enriches| E[DiffNode.serialized_before/after]
82
+ C -->|Enriches| F[DiffNode.attributes_before/after]
83
+ D --> G[Layer 4: Rendering]
84
+ E --> G
85
+ F --> G
86
+ G --> H[Accurate diff output]
87
+
88
+ style A fill:#fff4e1
89
+ style D fill:#e1f5ff
90
+ style G fill:#e1ffe1
91
+ ----
92
+
93
+ Key insight: Metadata is captured at diff creation time (Layer 2) and carried through to rendering (Layer 4), ensuring accurate display even if nodes are modified during comparison.
94
+
95
+ == Data Structures
96
+
97
+ === DiffNode
98
+
99
+ Represents a semantic difference between two nodes:
100
+
101
+ [source,ruby]
102
+ ----
103
+ class DiffNode
104
+ # Core properties
105
+ attr_reader :node1, :node2 # Raw node references
106
+ attr_accessor :dimension, :reason # What changed and why
107
+ attr_accessor :normative, :formatting # Classification
108
+
109
+ # Enriched metadata for Layer 4 rendering
110
+ attr_accessor :path # Canonical path with ordinal indices
111
+ attr_accessor :serialized_before # Serialized "before" content
112
+ attr_accessor :serialized_after # Serialized "after" content
113
+ attr_accessor :attributes_before # Normalized "before" attributes
114
+ attr_accessor :attributes_after # Normalized "after" attributes
115
+ end
116
+ ----
117
+
118
+ See link:diffnode-enrichment[DiffNode Enrichment] for details.
119
+
120
+ === TreeNode (Semantic Diff)
121
+
122
+ Canonical node representation used by the semantic tree diff:
123
+
124
+ [source,ruby]
125
+ ----
126
+ class TreeNode
127
+ attr_reader :label # Element name (e.g., "div", "span")
128
+ attr_reader :parent # Parent TreeNode
129
+ attr_reader :children # Array of child TreeNodes
130
+ attr_reader :attributes # Normalized attribute hash
131
+ attr_reader :source_node # Original parsing library node
132
+ attr_reader :signature # Calculated signature for matching
133
+ end
134
+ ----
135
+
136
+ See link:../advanced/semantic-tree-diff-internals[Semantic Tree Diff Internals] for details.
137
+
138
+ === ComparisonResult
139
+
140
+ Result object from verbose comparison:
141
+
142
+ [source,ruby]
143
+ ----
144
+ class ComparisonResult
145
+ attr_reader :differences # Array of DiffNode objects
146
+ attr_reader :preprocessed_strings # Preprocessed document strings
147
+ attr_reader :original_strings # Original document strings
148
+ attr_reader :format # :xml, :html, :json, :yaml
149
+ attr_reader :match_options # Resolved match options
150
+ attr_reader :algorithm # :dom or :semantic
151
+ end
152
+ ----
153
+
154
+ == Utility Classes
155
+
156
+ === PathBuilder
157
+
158
+ Generates canonical XPath-like paths with ordinal indices:
159
+
160
+ [source,ruby]
161
+ ----
162
+ # Build path for any node type
163
+ path = Canon::Diff::PathBuilder.build(node)
164
+ # => "/#document/div[0]/body[0]/p[1]/span[2]"
165
+
166
+ # Build human-readable path
167
+ human = Canon::Diff::PathBuilder.human_path(node)
168
+ # => "#document → div[0] → body[0] → p[1] → span[2]"
169
+ ----
170
+
171
+ **Features**:
172
+ * Library-agnostic: works with TreeNodes, Canon::Xml::Node, Nokogiri nodes
173
+ * Ordinal indices: uniquely identifies nodes among siblings
174
+ * Traverses parent hierarchy: builds complete path from root to node
175
+
176
+ See link:diffnode-enrichment#pathbuilder-canonical-paths-with-ordinal-indices[PathBuilder documentation] for details.
177
+
178
+ === NodeSerializer
179
+
180
+ Serializes nodes and extracts attributes regardless of parsing library:
181
+
182
+ [source,ruby]
183
+ ----
184
+ # Serialize any node
185
+ serialized = Canon::Diff::NodeSerializer.serialize(node)
186
+
187
+ # Extract normalized attributes
188
+ attrs = Canon::Diff::NodeSerializer.extract_attributes(node)
189
+ # => {"lang" => "EN-GB", "xml:lang" => "EN-GB", "id" => "example"}
190
+ ----
191
+
192
+ **Features**:
193
+ * Library-agnostic: handles Canon::Xml::Node, Nokogiri, Moxml
194
+ * Normalized output: consistent format regardless of source library
195
+ * Attribute extraction: returns hash of name-value pairs
196
+
197
+ See link:diffnode-enrichment#nodeserializer-library-agnostic-serialization[NodeSerializer documentation] for details.
198
+
199
+ == Algorithm Integration
200
+
201
+ === DOM Algorithm
202
+
203
+ Enriches DiffNodes during positional comparison in `lib/canon/comparison/xml_comparator.rb`:
204
+
205
+ [source,ruby]
206
+ ----
207
+ def add_difference(node1, node2, diff1, diff2, dimension, opts, differences)
208
+ metadata = enrich_diff_metadata(node1, node2)
209
+ diff_node = Canon::Diff::DiffNode.new(
210
+ node1: node1,
211
+ node2: node2,
212
+ dimension: dimension,
213
+ reason: build_difference_reason(node1, node2, diff1, diff2, dimension),
214
+ **metadata # Enriched from raw Nokogiri/Canon nodes
215
+ )
216
+ differences << diff_node
217
+ end
218
+ ----
219
+
220
+ See link:../understanding/algorithms/dom-diff.adoc[DOM Diff Algorithm] for details.
221
+
222
+ === Semantic Algorithm
223
+
224
+ Enriches DiffNodes during operation conversion in `lib/canon/tree_diff/operation_converter.rb`:
225
+
226
+ [source,ruby]
227
+ ----
228
+ def convert_insert(operation)
229
+ tree_node2 = operation[:node]
230
+ node2 = extract_source_node(tree_node2)
231
+ metadata = enrich_diff_metadata(nil, tree_node2)
232
+ diff_node = Canon::Diff::DiffNode.new(
233
+ node1: nil,
234
+ node2: node2,
235
+ dimension: :element_structure,
236
+ reason: build_insert_reason(operation),
237
+ **metadata # Enriched from TreeNode
238
+ )
239
+ diff_node.normative = determine_normative(:element_structure)
240
+ diff_node
241
+ end
242
+ ----
243
+
244
+ See link:../understanding/algorithms/semantic-tree-diff.adoc[Semantic Tree Diff Algorithm] for details.
245
+
246
+ == See Also
247
+
248
+ * link:../understanding/architecture.adoc[Architecture] - 4-layer architecture overview
249
+ * link:../understanding/algorithms/[Algorithms] - DOM and Semantic algorithm details
250
+ * link:../features/diff-formatting/[Diff Formatting] - Layer 4 rendering options
251
+ * link:../advanced/[Advanced Topics] - Deep technical documentation
data/docs/lychee.toml CHANGED
@@ -12,9 +12,11 @@ include_verbatim = true
12
12
  # Recursively check all files
13
13
  recursive = true
14
14
 
15
- # File types to check
15
+ # File types to check (regex patterns)
16
16
  include = [
17
- "_site/**/*.html"
17
+ "_site/**/*.html",
18
+ ".*\\.adoc$",
19
+ ".*\\.md$"
18
20
  ]
19
21
 
20
22
  # Excluded paths
@@ -25,7 +27,9 @@ exclude = [
25
27
  "vendor",
26
28
  ".bundle",
27
29
  ".sass-cache",
28
- ".jekyll-cache"
30
+ ".jekyll-cache",
31
+ "_site/.jekyll-cache",
32
+ "Gemfile.lock"
29
33
  ]
30
34
 
31
35
  # Link checking behavior
@@ -56,10 +60,13 @@ include_mail = false # Don't check mailto: links
56
60
  max_concurrency = 10
57
61
 
58
62
  # Verbose output for debugging
59
- verbose = "info"
63
+ verbose = "warn"
60
64
 
61
65
  # Require HTTPS where possible
62
66
  require_https = false # Don't enforce
63
67
 
64
- # Index files
65
- index_files = ["index.html"]
68
+ # Index files for directory URLs
69
+ index_files = ["index.html"]
70
+
71
+ # Ignore patterns file
72
+ ignore_file = ".lycheeignore"
@@ -0,0 +1,250 @@
1
+ = HTML Parser Selection Fix Design
2
+ :doctype: article
3
+ :date: 2025-01-17
4
+ :status: Approved
5
+
6
+ == Problem Statement
7
+
8
+ When comparing HTML documents with `lang` and `xml:lang` attributes, users see false attribute differences:
9
+
10
+ ----
11
+ ⊖ Expected (File 1):
12
+ <span> with 1 attribute: xml:lang
13
+
14
+ ⊕ Actual (File 2):
15
+ <span> with 2 attributes: lang, xml:lang
16
+ ----
17
+
18
+ Both HTML strings have identical attributes (`lang="EN-GB" xml:lang="EN-GB"`), but the comparison shows different attribute counts. This happens because:
19
+
20
+ . *DOM path* uses `Nokogiri::XML.fragment` for all HTML, which treats `lang` and `xml:lang` as the same attribute (XML namespace behavior)
21
+ . *Semantic path* uses `Nokogiri::HTML5.fragment` or `Nokogiri::HTML4.fragment`, which correctly treats them as distinct
22
+ . *The `parse_html` method ignores the format parameter* and returns raw strings, causing inconsistent parsing
23
+
24
+ == Root Cause
25
+
26
+ In `lib/canon/comparison.rb`, the `parse_html` method (at line 374) ignores its format argument and returns the input string unparsed:
27
+
28
+ [source,ruby]
29
+ ----
30
+ def parse_html(content, _format) # format is IGNORED!
31
+ return content unless content.is_a?(String)
32
+ # ... returns raw string instead of parsing
33
+ end
34
+ ----
35
+
36
+ This causes HTML version information to be lost, and `HtmlComparator#parse_node` ends up using `XML.fragment` for all HTML content.
37
+
38
+ == Solution
39
+
40
+ === Architecture
41
+
42
+ Fix the 4-layer architecture to respect the user's parser choice:
43
+
44
+ ----
45
+ User specifies format: :html5
46
+ |
47
+ v
48
+ Level 1: Preprocessing
49
+ parse_html(html, :html5) -> Nokogiri::HTML5.fragment ✓
50
+ |
51
+ v
52
+ Level 2: Diff Algorithm (DiffNode creation)
53
+ Parsed nodes have accurate attributes ✓
54
+ |
55
+ v
56
+ Level 3: Diff Report
57
+ Enriched metadata is correct ✓
58
+ |
59
+ v
60
+ Level 4: Diff Rendering
61
+ Accurate attribute counts in output ✓
62
+ ----
63
+
64
+ === Component Changes
65
+
66
+ ==== 1. `parse_html` Method (`lib/canon/comparison.rb`)
67
+
68
+ *Current behavior:* Ignores format parameter, returns raw string
69
+
70
+ *New behavior:* Parse with correct Nokogiri parser based on format
71
+
72
+ [source,ruby]
73
+ ----
74
+ def parse_html(content, format)
75
+ return content unless content.is_a?(String)
76
+ return content if already_parsed?(content)
77
+
78
+ begin
79
+ case format
80
+ when :html5
81
+ Nokogiri::HTML5.fragment(content)
82
+ when :html4
83
+ Nokogiri::HTML4.fragment(content)
84
+ when :html
85
+ detect_and_parse_html(content)
86
+ else
87
+ content
88
+ end
89
+ rescue StandardError
90
+ content
91
+ end
92
+ end
93
+
94
+ private
95
+
96
+ def already_parsed?(content)
97
+ content.is_a?(Nokogiri::HTML::Document) ||
98
+ content.is_a?(Nokogiri::HTML5::Document) ||
99
+ content.is_a?(Nokogiri::HTML::DocumentFragment) ||
100
+ content.is_a?(Nokogiri::HTML5::DocumentFragment) ||
101
+ content.is_a?(Nokogiri::XML::DocumentFragment)
102
+ end
103
+
104
+ def detect_and_parse_html(content)
105
+ version = detect_html_version(content)
106
+ version == :html5 ?
107
+ Nokogiri::HTML5.fragment(content) :
108
+ Nokogiri::HTML4.fragment(content)
109
+ end
110
+
111
+ def detect_html_version(content)
112
+ content.include?('<!DOCTYPE html>') ? :html5 : :html4
113
+ end
114
+ ----
115
+
116
+ ==== 2. `dom_diff` Method (`lib/canon/comparison.rb`)
117
+
118
+ *Current behavior:* Normalizes `:html4`/`:html5` to `:html` at line 320
119
+
120
+ *New behavior:* Preserve format information
121
+
122
+ Remove or modify line 320:
123
+
124
+ [source,ruby]
125
+ ----
126
+ # OLD: format1 = format2 = :html
127
+ # NEW: Keep format1, format2 as html4 or html5
128
+ ----
129
+
130
+ This ensures the format is passed through to `HtmlComparator` and used consistently.
131
+
132
+ === Error Handling
133
+
134
+ . *Parse failures:* Fall back to raw string (maintains backward compatibility)
135
+ . *Already-parsed documents:* Return as-is, don't re-parse
136
+ . *Mixed input types:* Both documents are parsed with the same parser, selected by the format parameter
137
+
138
+ === Testing Strategy
139
+
140
+ ==== Unit Tests (`spec/canon/comparison_spec.rb`)
141
+
142
+ [source,ruby]
143
+ ----
144
+ context "parse_html with format parameter" do
145
+ it "parses HTML5 with HTML5.fragment when format is :html5" do
146
+ html = '<span lang="en" xml:lang="en">text</span>'
147
+ result = Canon::Comparison.send(:parse_html, html, :html5)
148
+
149
+ expect(result).to be_a(Nokogiri::HTML5::DocumentFragment)
150
+ expect(result.at_css('span').attributes.keys).to eq(['lang', 'xml:lang'])
151
+ end
152
+
153
+ it "parses HTML4 with HTML4.fragment when format is :html4" do
154
+ html = '<span lang="en" xml:lang="en">text</span>'
155
+ result = Canon::Comparison.send(:parse_html, html, :html4)
156
+
157
+ expect(result).to be_a(Nokogiri::HTML4::DocumentFragment)
158
+ end
159
+
160
+ it "returns already-parsed documents as-is" do
161
+ frag = Nokogiri::HTML5.fragment('<span>text</span>')
162
+ result = Canon::Comparison.send(:parse_html, frag, :html5)
163
+
164
+ expect(result).to eq(frag)
165
+ end
166
+ end
167
+ ----
168
+
169
+ ==== Integration Tests (`spec/canon/html_comparison_spec.rb`)
170
+
171
+ [source,ruby]
172
+ ----
173
+ context "HTML5 lang and xml:lang attributes" do
174
+ it "treats lang and xml:lang as distinct attributes in HTML5" do
175
+ html1 = '<span lang="EN-GB" xml:lang="EN-GB">text</span>'
176
+ html2 = '<span lang="EN-GB" xml:lang="EN-GB">text</span>'
177
+
178
+ result = Canon::Comparison.equivalent?(
179
+ html1, html2,
180
+ format: :html5,
181
+ verbose: true
182
+ )
183
+
184
+ expect(result).to be_equivalent
185
+ end
186
+
187
+ it "does NOT show false attribute differences" do
188
+ html1 = '<span lang="EN-GB" xml:lang="EN-GB">&#xA0;</span>'
189
+ html2 = '<span lang="EN-GB" xml:lang="EN-GB">␣</span>'
190
+
191
+ result = Canon::Comparison.equivalent?(
192
+ html1, html2,
193
+ format: :html5,
194
+ verbose: true
195
+ )
196
+
197
+ # Only difference should be the non-breaking space encoding
198
+ # No attribute differences should be reported
199
+ attr_diffs = result.differences.select { |d| d.dimension == :attribute_values }
200
+ expect(attr_diffs).to be_empty
201
+ end
202
+ end
203
+ ----
204
+
205
+ ==== Backward Compatibility Tests
206
+
207
+ [source,ruby]
208
+ ----
209
+ context "backward compatibility" do
210
+ it "works when format is not specified (auto-detect)" do
211
+ html1 = '<span>text</span>'
212
+ html2 = '<span>text</span>'
213
+
214
+ expect(Canon::Comparison.equivalent?(html1, html2)).to be true
215
+ end
216
+
217
+ it "handles strings with :html format (legacy behavior)" do
218
+ html1 = '<span>text</span>'
219
+ html2 = '<span>text</span>'
220
+
221
+ expect(Canon::Comparison.equivalent?(html1, html2, format: :html)).to be true
222
+ end
223
+ end
224
+ ----
225
+
226
+ == Implementation Checklist
227
+
228
+ * [ ] Modify `parse_html` in `lib/canon/comparison.rb`
229
+ * [ ] Add helper methods: `already_parsed?`, `detect_and_parse_html`, `detect_html_version`
230
+ * [ ] Update `dom_diff` to preserve format (line 320)
231
+ * [ ] Add unit tests for `parse_html` method
232
+ * [ ] Add integration tests for lang/xml:lang
233
+ * [ ] Add backward compatibility tests
234
+ * [ ] Run full test suite to ensure no regressions
235
+
236
+ == Expected Outcomes
237
+
238
+ After this fix:
239
+
240
+ . *`lang` and `xml:lang` are treated as distinct attributes in HTML5/HTML4*
241
+ . *No false attribute differences when both documents have identical attributes*
242
+ . *User can explicitly control parser via `format: :html5` or `format: :html4`*
243
+ . *Backward compatible with existing code (auto-detect still works)*
244
+ . *Consistent parsing regardless of input type (String vs DocumentFragment)*
245
+
246
+ == Notes
247
+
248
+ . HTML entity normalization (`&#xa0;` vs `␣`) is intentionally NOT changed - these are semantically equivalent but different serializations, and the diff correctly shows this difference
249
+ . XML comparison continues to use `XML.fragment` - this fix only affects HTML parsing
250
+ . The semantic path already works correctly via `Canon::Html::DataModel.from_html`