canon 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +25 -135
- data/README.adoc +13 -13
- data/docs/.lycheeignore +69 -0
- data/docs/advanced/extending-canon.adoc +193 -0
- data/docs/internals/diffnode-enrichment.adoc +611 -0
- data/docs/internals/index.adoc +251 -0
- data/docs/lychee.toml +13 -6
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +250 -0
- data/docs/understanding/architecture.adoc +749 -33
- data/docs/understanding/comparison-pipeline.adoc +122 -0
- data/false_positive_analysis.txt +0 -0
- data/file1.html +1 -0
- data/file2.html +1 -0
- data/lib/canon/cache.rb +129 -0
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
- data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
- data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
- data/lib/canon/comparison/dimensions/registry.rb +77 -0
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
- data/lib/canon/comparison/dimensions.rb +54 -0
- data/lib/canon/comparison/format_detector.rb +86 -0
- data/lib/canon/comparison/html_comparator.rb +51 -18
- data/lib/canon/comparison/html_parser.rb +80 -0
- data/lib/canon/comparison/json_comparator.rb +12 -0
- data/lib/canon/comparison/json_parser.rb +19 -0
- data/lib/canon/comparison/markup_comparator.rb +293 -0
- data/lib/canon/comparison/match_options/base_resolver.rb +143 -0
- data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
- data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
- data/lib/canon/comparison/match_options.rb +68 -463
- data/lib/canon/comparison/profile_definition.rb +149 -0
- data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
- data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +189 -0
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +74 -0
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +95 -0
- data/lib/canon/comparison/xml_comparator.rb +52 -664
- data/lib/canon/comparison/xml_node_comparison.rb +297 -0
- data/lib/canon/comparison/xml_parser.rb +19 -0
- data/lib/canon/comparison/yaml_comparator.rb +3 -3
- data/lib/canon/comparison.rb +265 -110
- data/lib/canon/diff/diff_node.rb +32 -2
- data/lib/canon/diff/node_serializer.rb +191 -0
- data/lib/canon/diff/path_builder.rb +143 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
- data/lib/canon/diff_formatter.rb +1 -1
- data/lib/canon/rspec_matchers.rb +1 -1
- data/lib/canon/tree_diff/operation_converter.rb +92 -338
- data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
- data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
- data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
- data/lib/canon/version.rb +1 -1
- data/old-docs/ADVANCED_TOPICS.adoc +20 -0
- data/old-docs/BASIC_USAGE.adoc +16 -0
- data/old-docs/CHARACTER_VISUALIZATION.adoc +567 -0
- data/old-docs/CLI.adoc +497 -0
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
- data/old-docs/DIFF_ARCHITECTURE.adoc +435 -0
- data/old-docs/DIFF_FORMATTING.adoc +540 -0
- data/old-docs/DIFF_PARAMETERS.adoc +261 -0
- data/old-docs/DOM_DIFF.adoc +1017 -0
- data/old-docs/ENV_CONFIG.adoc +876 -0
- data/old-docs/FORMATS.adoc +867 -0
- data/old-docs/INPUT_VALIDATION.adoc +477 -0
- data/old-docs/MATCHER_BEHAVIOR.adoc +90 -0
- data/old-docs/MATCH_ARCHITECTURE.adoc +463 -0
- data/old-docs/MATCH_OPTIONS.adoc +912 -0
- data/old-docs/MODES.adoc +432 -0
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
- data/old-docs/OPTIONS.adoc +1387 -0
- data/old-docs/PREPROCESSING.adoc +491 -0
- data/old-docs/README.old.adoc +2831 -0
- data/old-docs/RSPEC.adoc +814 -0
- data/old-docs/RUBY_API.adoc +485 -0
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +646 -0
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +765 -0
- data/old-docs/STRING_COMPARE.adoc +345 -0
- data/old-docs/TMP.adoc +3384 -0
- data/old-docs/TREE_DIFF.adoc +1080 -0
- data/old-docs/UNDERSTANDING_CANON.adoc +17 -0
- data/old-docs/VERBOSE.adoc +482 -0
- data/old-docs/VISUALIZATION_MAP.adoc +625 -0
- data/old-docs/WHITESPACE_TREATMENT.adoc +1155 -0
- data/scripts/analyze_current_state.rb +85 -0
- data/scripts/analyze_false_positives.rb +114 -0
- data/scripts/analyze_remaining_failures.rb +105 -0
- data/scripts/compare_current_failures.rb +95 -0
- data/scripts/compare_dom_tree_diff.rb +158 -0
- data/scripts/compare_failures.rb +151 -0
- data/scripts/debug_attribute_extraction.rb +66 -0
- data/scripts/debug_blocks_839.rb +115 -0
- data/scripts/debug_meta_matching.rb +52 -0
- data/scripts/debug_p_matching.rb +192 -0
- data/scripts/debug_signature_matching.rb +118 -0
- data/scripts/debug_sourcecode_124.rb +32 -0
- data/scripts/debug_whitespace_sensitive.rb +192 -0
- data/scripts/extract_false_positives.rb +138 -0
- data/scripts/find_actual_false_positives.rb +125 -0
- data/scripts/investigate_all_false_positives.rb +161 -0
- data/scripts/investigate_batch1.rb +127 -0
- data/scripts/investigate_classification.rb +150 -0
- data/scripts/investigate_classification_detailed.rb +190 -0
- data/scripts/investigate_common_failures.rb +342 -0
- data/scripts/investigate_false_negative.rb +80 -0
- data/scripts/investigate_false_positive.rb +83 -0
- data/scripts/investigate_false_positives.rb +227 -0
- data/scripts/investigate_false_positives_batch.rb +163 -0
- data/scripts/investigate_mixed_content.rb +125 -0
- data/scripts/investigate_remaining_16.rb +214 -0
- data/scripts/run_single_test.rb +29 -0
- data/scripts/test_all_false_positives.rb +95 -0
- data/scripts/test_attribute_details.rb +61 -0
- data/scripts/test_both_algorithms.rb +49 -0
- data/scripts/test_both_simple.rb +49 -0
- data/scripts/test_enhanced_semantic_output.rb +125 -0
- data/scripts/test_readme_examples.rb +131 -0
- data/scripts/test_semantic_tree_diff.rb +99 -0
- data/scripts/test_semantic_ux_improvements.rb +135 -0
- data/scripts/test_single_false_positive.rb +119 -0
- data/scripts/test_size_limits.rb +99 -0
- data/test_html_1.html +21 -0
- data/test_html_2.html +21 -0
- data/test_nokogiri.rb +33 -0
- data/test_normalize.rb +45 -0
- metadata +123 -2
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Internals
|
|
3
|
+
parent: Advanced
|
|
4
|
+
nav_order: 7
|
|
5
|
+
has_children: true
|
|
6
|
+
---
|
|
7
|
+
= Internals
|
|
8
|
+
|
|
9
|
+
== Purpose
|
|
10
|
+
|
|
11
|
+
This section contains detailed implementation documentation for Canon's internal systems. These documents explain how Canon works under the hood, covering data structures, algorithms, and architectural patterns.
|
|
12
|
+
|
|
13
|
+
== Audience
|
|
14
|
+
|
|
15
|
+
These documents are intended for:
|
|
16
|
+
|
|
17
|
+
* Canon contributors and maintainers
|
|
18
|
+
* Developers extending Canon with custom functionality
|
|
19
|
+
* Anyone debugging complex comparison issues
|
|
20
|
+
* Users wanting to understand implementation details
|
|
21
|
+
|
|
22
|
+
== Topics
|
|
23
|
+
|
|
24
|
+
link:diffnode-enrichment[**DiffNode Enrichment**]::
|
|
25
|
+
How DiffNode objects carry location, serialized content, and attribute metadata through the comparison pipeline. Covers PathBuilder (canonical paths with ordinal indices), NodeSerializer (library-agnostic serialization), and how Layer 2 algorithms populate metadata for Layer 4 rendering.
|
|
26
|
+
|
|
27
|
+
link:../advanced/dom-diff-internals[**DOM Diff Internals**]::
|
|
28
|
+
Deep dive into Canon's default DOM diff algorithm: position-based matching, operation detection, and diff generation.
|
|
29
|
+
|
|
30
|
+
link:../advanced/semantic-tree-diff-internals[**Semantic Tree Diff Internals**]::
|
|
31
|
+
How the experimental semantic tree diff works: signature calculation, similarity matching, and operation classification.
|
|
32
|
+
|
|
33
|
+
link:../advanced/verbose-mode-architecture[**Verbose Mode Architecture**]::
|
|
34
|
+
The two-tier diff output system: normative vs informative diffs, and how verbose mode enriches output.
|
|
35
|
+
|
|
36
|
+
link:../advanced/diff-classification[**Diff Classification System**]::
|
|
37
|
+
How Canon classifies differences as normative (structural) or informative (presentational).
|
|
38
|
+
|
|
39
|
+
link:../advanced/diff-pipeline[**Diff Pipeline Architecture**]::
|
|
40
|
+
The six-layer technical pipeline from input to formatted output.
|
|
41
|
+
|
|
42
|
+
link:../advanced/extending-canon[**Extending Canon**]::
|
|
43
|
+
How to create custom comparators, formatters, and match strategies.
|
|
44
|
+
|
|
45
|
+
== Core Concepts
|
|
46
|
+
|
|
47
|
+
=== Library Agnosticism
|
|
48
|
+
|
|
49
|
+
Canon is designed to work with multiple parsing libraries (Nokogiri, Moxml, Canon::Xml::Node) without being tied to any specific implementation. This is achieved through:
|
|
50
|
+
|
|
51
|
+
* **Adapter pattern**: Format-specific adapters normalize different node types
|
|
52
|
+
* **Utility classes**: PathBuilder and NodeSerializer work with any library
|
|
53
|
+
* **Interface-based design**: Code depends on behavior (respond_to?) not concrete types
|
|
54
|
+
|
|
55
|
+
This allows Canon to:
|
|
56
|
+
* Support new parsing libraries without major refactoring
|
|
57
|
+
* Switch libraries for better performance or features
|
|
58
|
+
* Remain compatible as libraries evolve
|
|
59
|
+
|
|
60
|
+
=== The 4-Layer Architecture
|
|
61
|
+
|
|
62
|
+
Canon separates comparison concerns into four layers:
|
|
63
|
+
|
|
64
|
+
* **Layer 1**: Preprocessing - Normalize documents before comparison
|
|
65
|
+
* **Layer 2**: Algorithm - Choose comparison strategy (DOM vs Semantic)
|
|
66
|
+
* **Layer 3**: Match Options - Configure what to compare
|
|
67
|
+
* **Layer 4**: Diff Formatting - Control output presentation
|
|
68
|
+
|
|
69
|
+
Only Layer 2 differs between algorithms, and the enriched DiffNode structure ensures clean communication between layers.
|
|
70
|
+
|
|
71
|
+
See link:../understanding/architecture.adoc[Architecture] for the complete overview.
|
|
72
|
+
|
|
73
|
+
=== Enriched Metadata Flow
|
|
74
|
+
|
|
75
|
+
[mermaid]
|
|
76
|
+
----
|
|
77
|
+
graph LR
|
|
78
|
+
A[Layer 2: Algorithm] -->|Creates DiffNode| B[PathBuilder]
|
|
79
|
+
A -->|Creates DiffNode| C[NodeSerializer]
|
|
80
|
+
B -->|Enriches| D[DiffNode.path]
|
|
81
|
+
C -->|Enriches| E[DiffNode.serialized_before/after]
|
|
82
|
+
C -->|Enriches| F[DiffNode.attributes_before/after]
|
|
83
|
+
D --> G[Layer 4: Rendering]
|
|
84
|
+
E --> G
|
|
85
|
+
F --> G
|
|
86
|
+
G --> H[Accurate diff output]
|
|
87
|
+
|
|
88
|
+
style A fill:#fff4e1
|
|
89
|
+
style D fill:#e1f5ff
|
|
90
|
+
style G fill:#e1ffe1
|
|
91
|
+
----
|
|
92
|
+
|
|
93
|
+
Key insight: Metadata is captured at diff creation time (Layer 2) and carried through to rendering (Layer 4), ensuring accurate display even if nodes are modified during comparison.
|
|
94
|
+
|
|
95
|
+
== Data Structures
|
|
96
|
+
|
|
97
|
+
=== DiffNode
|
|
98
|
+
|
|
99
|
+
Represents a semantic difference between two nodes:
|
|
100
|
+
|
|
101
|
+
[source,ruby]
|
|
102
|
+
----
|
|
103
|
+
class DiffNode
|
|
104
|
+
# Core properties
|
|
105
|
+
attr_reader :node1, :node2 # Raw node references
|
|
106
|
+
attr_accessor :dimension, :reason # What changed and why
|
|
107
|
+
attr_accessor :normative, :formatting # Classification
|
|
108
|
+
|
|
109
|
+
# Enriched metadata for Layer 4 rendering
|
|
110
|
+
attr_accessor :path # Canonical path with ordinal indices
|
|
111
|
+
attr_accessor :serialized_before # Serialized "before" content
|
|
112
|
+
attr_accessor :serialized_after # Serialized "after" content
|
|
113
|
+
attr_accessor :attributes_before # Normalized "before" attributes
|
|
114
|
+
attr_accessor :attributes_after # Normalized "after" attributes
|
|
115
|
+
end
|
|
116
|
+
----
|
|
117
|
+
|
|
118
|
+
See link:diffnode-enrichment[DiffNode Enrichment] for details.
|
|
119
|
+
|
|
120
|
+
=== TreeNode (Semantic Diff)
|
|
121
|
+
|
|
122
|
+
Canonical node representation used by the semantic diff algorithm:
|
|
123
|
+
|
|
124
|
+
[source,ruby]
|
|
125
|
+
----
|
|
126
|
+
class TreeNode
|
|
127
|
+
attr_reader :label # Element name (e.g., "div", "span")
|
|
128
|
+
attr_reader :parent # Parent TreeNode
|
|
129
|
+
attr_reader :children # Array of child TreeNodes
|
|
130
|
+
attr_reader :attributes # Normalized attribute hash
|
|
131
|
+
attr_reader :source_node # Original parsing library node
|
|
132
|
+
attr_reader :signature # Calculated signature for matching
|
|
133
|
+
end
|
|
134
|
+
----
|
|
135
|
+
|
|
136
|
+
See link:../advanced/semantic-tree-diff-internals[Semantic Tree Diff Internals] for details.
|
|
137
|
+
|
|
138
|
+
=== ComparisonResult
|
|
139
|
+
|
|
140
|
+
Result object from verbose comparison:
|
|
141
|
+
|
|
142
|
+
[source,ruby]
|
|
143
|
+
----
|
|
144
|
+
class ComparisonResult
|
|
145
|
+
attr_reader :differences # Array of DiffNode objects
|
|
146
|
+
attr_reader :preprocessed_strings # Preprocessed document strings
|
|
147
|
+
attr_reader :original_strings # Original document strings
|
|
148
|
+
attr_reader :format # :xml, :html, :json, :yaml
|
|
149
|
+
attr_reader :match_options # Resolved match options
|
|
150
|
+
attr_reader :algorithm # :dom or :semantic
|
|
151
|
+
end
|
|
152
|
+
----
|
|
153
|
+
|
|
154
|
+
== Utility Classes
|
|
155
|
+
|
|
156
|
+
=== PathBuilder
|
|
157
|
+
|
|
158
|
+
Generates canonical XPath-like paths with ordinal indices:
|
|
159
|
+
|
|
160
|
+
[source,ruby]
|
|
161
|
+
----
|
|
162
|
+
# Build path for any node type
|
|
163
|
+
path = Canon::Diff::PathBuilder.build(node)
|
|
164
|
+
# => "/#document/div[0]/body[0]/p[1]/span[2]"
|
|
165
|
+
|
|
166
|
+
# Build human-readable path
|
|
167
|
+
human = Canon::Diff::PathBuilder.human_path(node)
|
|
168
|
+
# => "#document → div[0] → body[0] → p[1] → span[2]"
|
|
169
|
+
----
|
|
170
|
+
|
|
171
|
+
**Features**:
|
|
172
|
+
* Library-agnostic: works with TreeNodes, Canon::Xml::Node, Nokogiri nodes
|
|
173
|
+
* Ordinal indices: uniquely identifies nodes among siblings
|
|
174
|
+
* Traverses parent hierarchy: builds complete path from root to node
|
|
175
|
+
|
|
176
|
+
See link:diffnode-enrichment#pathbuilder-canonical-paths-with-ordinal-indices[PathBuilder documentation] for details.
|
|
177
|
+
|
|
178
|
+
=== NodeSerializer
|
|
179
|
+
|
|
180
|
+
Serializes nodes and extracts attributes regardless of parsing library:
|
|
181
|
+
|
|
182
|
+
[source,ruby]
|
|
183
|
+
----
|
|
184
|
+
# Serialize any node
|
|
185
|
+
serialized = Canon::Diff::NodeSerializer.serialize(node)
|
|
186
|
+
|
|
187
|
+
# Extract normalized attributes
|
|
188
|
+
attrs = Canon::Diff::NodeSerializer.extract_attributes(node)
|
|
189
|
+
# => {"lang" => "EN-GB", "xml:lang" => "EN-GB", "id" => "example"}
|
|
190
|
+
----
|
|
191
|
+
|
|
192
|
+
**Features**:
|
|
193
|
+
* Library-agnostic: handles Canon::Xml::Node, Nokogiri, Moxml
|
|
194
|
+
* Normalized output: consistent format regardless of source library
|
|
195
|
+
* Attribute extraction: returns hash of name-value pairs
|
|
196
|
+
|
|
197
|
+
See link:diffnode-enrichment#nodeserializer-library-agnostic-serialization[NodeSerializer documentation] for details.
|
|
198
|
+
|
|
199
|
+
== Algorithm Integration
|
|
200
|
+
|
|
201
|
+
=== DOM Algorithm
|
|
202
|
+
|
|
203
|
+
Enriches DiffNodes during positional comparison in `lib/canon/comparison/xml_comparator.rb`:
|
|
204
|
+
|
|
205
|
+
[source,ruby]
|
|
206
|
+
----
|
|
207
|
+
def add_difference(node1, node2, diff1, diff2, dimension, opts, differences)
|
|
208
|
+
metadata = enrich_diff_metadata(node1, node2)
|
|
209
|
+
diff_node = Canon::Diff::DiffNode.new(
|
|
210
|
+
node1: node1,
|
|
211
|
+
node2: node2,
|
|
212
|
+
dimension: dimension,
|
|
213
|
+
reason: build_difference_reason(node1, node2, diff1, diff2, dimension),
|
|
214
|
+
**metadata # Enriched from raw Nokogiri/Canon nodes
|
|
215
|
+
)
|
|
216
|
+
differences << diff_node
|
|
217
|
+
end
|
|
218
|
+
----
|
|
219
|
+
|
|
220
|
+
See link:../understanding/algorithms/dom-diff.adoc[DOM Diff Algorithm] for details.
|
|
221
|
+
|
|
222
|
+
=== Semantic Algorithm
|
|
223
|
+
|
|
224
|
+
Enriches DiffNodes during operation conversion in `lib/canon/tree_diff/operation_converter.rb`:
|
|
225
|
+
|
|
226
|
+
[source,ruby]
|
|
227
|
+
----
|
|
228
|
+
def convert_insert(operation)
|
|
229
|
+
tree_node2 = operation[:node]
|
|
230
|
+
node2 = extract_source_node(tree_node2)
|
|
231
|
+
metadata = enrich_diff_metadata(nil, tree_node2)
|
|
232
|
+
diff_node = Canon::Diff::DiffNode.new(
|
|
233
|
+
node1: nil,
|
|
234
|
+
node2: node2,
|
|
235
|
+
dimension: :element_structure,
|
|
236
|
+
reason: build_insert_reason(operation),
|
|
237
|
+
**metadata # Enriched from TreeNode
|
|
238
|
+
)
|
|
239
|
+
diff_node.normative = determine_normative(:element_structure)
|
|
240
|
+
diff_node
|
|
241
|
+
end
|
|
242
|
+
----
|
|
243
|
+
|
|
244
|
+
See link:../understanding/algorithms/semantic-tree-diff.adoc[Semantic Tree Diff Algorithm] for details.
|
|
245
|
+
|
|
246
|
+
== See Also
|
|
247
|
+
|
|
248
|
+
* link:../understanding/architecture.adoc[Architecture] - 4-layer architecture overview
|
|
249
|
+
* link:../understanding/algorithms/[Algorithms] - DOM and Semantic algorithm details
|
|
250
|
+
* link:../features/diff-formatting/[Diff Formatting] - Layer 4 rendering options
|
|
251
|
+
* link:../advanced/[Advanced Topics] - Deep technical documentation
|
data/docs/lychee.toml
CHANGED
|
@@ -12,9 +12,11 @@ include_verbatim = true
|
|
|
12
12
|
# Recursively check all files
|
|
13
13
|
recursive = true
|
|
14
14
|
|
|
15
|
-
# File types to check
|
|
15
|
+
# File types to check (regex patterns)
|
|
16
16
|
include = [
|
|
17
|
-
"_site/**/*.html"
|
|
17
|
+
"_site/**/*.html",
|
|
18
|
+
".*\\.adoc$",
|
|
19
|
+
".*\\.md$"
|
|
18
20
|
]
|
|
19
21
|
|
|
20
22
|
# Excluded paths
|
|
@@ -25,7 +27,9 @@ exclude = [
|
|
|
25
27
|
"vendor",
|
|
26
28
|
".bundle",
|
|
27
29
|
".sass-cache",
|
|
28
|
-
".jekyll-cache"
|
|
30
|
+
".jekyll-cache",
|
|
31
|
+
"_site/.jekyll-cache",
|
|
32
|
+
"Gemfile.lock"
|
|
29
33
|
]
|
|
30
34
|
|
|
31
35
|
# Link checking behavior
|
|
@@ -56,10 +60,13 @@ include_mail = false # Don't check mailto: links
|
|
|
56
60
|
max_concurrency = 10
|
|
57
61
|
|
|
58
62
|
# Verbose output for debugging
|
|
59
|
-
verbose = "
|
|
63
|
+
verbose = "warn"
|
|
60
64
|
|
|
61
65
|
# Require HTTPS where possible
|
|
62
66
|
require_https = false # Don't enforce
|
|
63
67
|
|
|
64
|
-
# Index files
|
|
65
|
-
index_files = ["index.html"]
|
|
68
|
+
# Index files for directory URLs
|
|
69
|
+
index_files = ["index.html"]
|
|
70
|
+
|
|
71
|
+
# Ignore patterns file
|
|
72
|
+
ignore_file = ".lycheeignore"
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
= HTML Parser Selection Fix Design
|
|
2
|
+
:doctype: article
|
|
3
|
+
:date: 2025-01-17
|
|
4
|
+
:status: Approved
|
|
5
|
+
|
|
6
|
+
== Problem Statement
|
|
7
|
+
|
|
8
|
+
When comparing HTML documents with `lang` and `xml:lang` attributes, users see false attribute differences:
|
|
9
|
+
|
|
10
|
+
----
|
|
11
|
+
⊖ Expected (File 1):
|
|
12
|
+
<span> with 1 attribute: xml:lang
|
|
13
|
+
|
|
14
|
+
⊕ Actual (File 2):
|
|
15
|
+
<span> with 2 attributes: lang, xml:lang
|
|
16
|
+
----
|
|
17
|
+
|
|
18
|
+
Both HTML strings have identical attributes (`lang="EN-GB" xml:lang="EN-GB"`), but the comparison shows different attribute counts. This happens because:
|
|
19
|
+
|
|
20
|
+
. *DOM path* uses `Nokogiri::XML.fragment` for all HTML, which treats `lang` and `xml:lang` as the same attribute (XML namespace behavior)
|
|
21
|
+
. *Semantic path* uses `Nokogiri::HTML5.fragment` or `Nokogiri::HTML4.fragment`, which correctly treats them as distinct
|
|
22
|
+
. *The `parse_html` method ignores the format parameter* and returns raw strings, causing inconsistent parsing
|
|
23
|
+
|
|
24
|
+
== Root Cause
|
|
25
|
+
|
|
26
|
+
In `lib/canon/comparison.rb`, the `parse_html` method at line 374:
|
|
27
|
+
|
|
28
|
+
[source,ruby]
|
|
29
|
+
----
|
|
30
|
+
def parse_html(content, _format) # format is IGNORED!
|
|
31
|
+
return content unless content.is_a?(String)
|
|
32
|
+
# ... returns raw string instead of parsing
|
|
33
|
+
end
|
|
34
|
+
----
|
|
35
|
+
|
|
36
|
+
This causes HTML version information to be lost, and `HtmlComparator#parse_node` ends up using `XML.fragment` for all HTML content.
|
|
37
|
+
|
|
38
|
+
== Solution
|
|
39
|
+
|
|
40
|
+
=== Architecture
|
|
41
|
+
|
|
42
|
+
Fix the 4-layer architecture to respect the user's parser choice:
|
|
43
|
+
|
|
44
|
+
----
|
|
45
|
+
User specifies format: :html5
|
|
46
|
+
|
|
|
47
|
+
v
|
|
48
|
+
Level 1: Preprocessing
|
|
49
|
+
parse_html(html, :html5) -> Nokogiri::HTML5.fragment ✓
|
|
50
|
+
|
|
|
51
|
+
v
|
|
52
|
+
Level 2: Diff Algorithm (DiffNode creation)
|
|
53
|
+
Parsed nodes have accurate attributes ✓
|
|
54
|
+
|
|
|
55
|
+
v
|
|
56
|
+
Level 3: Diff Report
|
|
57
|
+
Enriched metadata is correct ✓
|
|
58
|
+
|
|
|
59
|
+
v
|
|
60
|
+
Level 4: Diff Rendering
|
|
61
|
+
Accurate attribute counts in output ✓
|
|
62
|
+
----
|
|
63
|
+
|
|
64
|
+
=== Component Changes
|
|
65
|
+
|
|
66
|
+
==== 1. `parse_html` Method (`lib/canon/comparison.rb`)
|
|
67
|
+
|
|
68
|
+
*Current behavior:* Ignores format parameter, returns raw string
|
|
69
|
+
|
|
70
|
+
*New behavior:* Parse with correct Nokogiri parser based on format
|
|
71
|
+
|
|
72
|
+
[source,ruby]
|
|
73
|
+
----
|
|
74
|
+
def parse_html(content, format)
|
|
75
|
+
return content unless content.is_a?(String)
|
|
76
|
+
return content if already_parsed?(content)
|
|
77
|
+
|
|
78
|
+
begin
|
|
79
|
+
case format
|
|
80
|
+
when :html5
|
|
81
|
+
Nokogiri::HTML5.fragment(content)
|
|
82
|
+
when :html4
|
|
83
|
+
Nokogiri::HTML4.fragment(content)
|
|
84
|
+
when :html
|
|
85
|
+
detect_and_parse_html(content)
|
|
86
|
+
else
|
|
87
|
+
content
|
|
88
|
+
end
|
|
89
|
+
rescue StandardError
|
|
90
|
+
content
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
private
|
|
95
|
+
|
|
96
|
+
def already_parsed?(content)
|
|
97
|
+
content.is_a?(Nokogiri::HTML::Document) ||
|
|
98
|
+
content.is_a?(Nokogiri::HTML5::Document) ||
|
|
99
|
+
content.is_a?(Nokogiri::HTML::DocumentFragment) ||
|
|
100
|
+
content.is_a?(Nokogiri::HTML5::DocumentFragment) ||
|
|
101
|
+
content.is_a?(Nokogiri::XML::DocumentFragment)
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def detect_and_parse_html(content)
|
|
105
|
+
version = detect_html_version(content)
|
|
106
|
+
version == :html5 ?
|
|
107
|
+
Nokogiri::HTML5.fragment(content) :
|
|
108
|
+
Nokogiri::HTML4.fragment(content)
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def detect_html_version(content)
|
|
112
|
+
content.include?('<!DOCTYPE html>') ? :html5 : :html4
|
|
113
|
+
end
|
|
114
|
+
----
|
|
115
|
+
|
|
116
|
+
==== 2. `dom_diff` Method (`lib/canon/comparison.rb`)
|
|
117
|
+
|
|
118
|
+
*Current behavior:* Normalizes `:html4`/`:html5` to `:html` at line 320
|
|
119
|
+
|
|
120
|
+
*New behavior:* Preserve format information
|
|
121
|
+
|
|
122
|
+
Remove or modify line 320:
|
|
123
|
+
|
|
124
|
+
[source,ruby]
|
|
125
|
+
----
|
|
126
|
+
# OLD: format1 = format2 = :html
|
|
127
|
+
# NEW: Keep format1, format2 as html4 or html5
|
|
128
|
+
----
|
|
129
|
+
|
|
130
|
+
This ensures the format is passed through to `HtmlComparator` and used consistently.
|
|
131
|
+
|
|
132
|
+
=== Error Handling
|
|
133
|
+
|
|
134
|
+
. *Parse failures:* Fall back to raw string (maintains backward compatibility)
|
|
135
|
+
. *Already-parsed documents:* Return as-is, don't re-parse
|
|
136
|
+
. *Mixed input types:* Both documents are parsed with the same parser, selected by the format parameter
|
|
137
|
+
|
|
138
|
+
=== Testing Strategy
|
|
139
|
+
|
|
140
|
+
==== Unit Tests (`spec/canon/comparison_spec.rb`)
|
|
141
|
+
|
|
142
|
+
[source,ruby]
|
|
143
|
+
----
|
|
144
|
+
context "parse_html with format parameter" do
|
|
145
|
+
it "parses HTML5 with HTML5.fragment when format is :html5" do
|
|
146
|
+
html = '<span lang="en" xml:lang="en">text</span>'
|
|
147
|
+
result = Canon::Comparison.send(:parse_html, html, :html5)
|
|
148
|
+
|
|
149
|
+
expect(result).to be_a(Nokogiri::HTML5::DocumentFragment)
|
|
150
|
+
expect(result.at_css('span').attributes.keys).to eq(['lang', 'xml:lang'])
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
it "parses HTML4 with HTML4.fragment when format is :html4" do
|
|
154
|
+
html = '<span lang="en" xml:lang="en">text</span>'
|
|
155
|
+
result = Canon::Comparison.send(:parse_html, html, :html4)
|
|
156
|
+
|
|
157
|
+
expect(result).to be_a(Nokogiri::HTML4::DocumentFragment)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
it "returns already-parsed documents as-is" do
|
|
161
|
+
frag = Nokogiri::HTML5.fragment('<span>text</span>')
|
|
162
|
+
result = Canon::Comparison.send(:parse_html, frag, :html5)
|
|
163
|
+
|
|
164
|
+
expect(result).to eq(frag)
|
|
165
|
+
end
|
|
166
|
+
end
|
|
167
|
+
----
|
|
168
|
+
|
|
169
|
+
==== Integration Tests (`spec/canon/html_comparison_spec.rb`)
|
|
170
|
+
|
|
171
|
+
[source,ruby]
|
|
172
|
+
----
|
|
173
|
+
context "HTML5 lang and xml:lang attributes" do
|
|
174
|
+
it "treats lang and xml:lang as distinct attributes in HTML5" do
|
|
175
|
+
html1 = '<span lang="EN-GB" xml:lang="EN-GB">text</span>'
|
|
176
|
+
html2 = '<span lang="EN-GB" xml:lang="EN-GB">text</span>'
|
|
177
|
+
|
|
178
|
+
result = Canon::Comparison.equivalent?(
|
|
179
|
+
html1, html2,
|
|
180
|
+
format: :html5,
|
|
181
|
+
verbose: true
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
expect(result).to be_equivalent
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
it "does NOT show false attribute differences" do
|
|
188
|
+
html1 = '<span lang="EN-GB" xml:lang="EN-GB"> </span>'
|
|
189
|
+
html2 = '<span lang="EN-GB" xml:lang="EN-GB">␣</span>'
|
|
190
|
+
|
|
191
|
+
result = Canon::Comparison.equivalent?(
|
|
192
|
+
html1, html2,
|
|
193
|
+
format: :html5,
|
|
194
|
+
verbose: true
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
# Only difference should be the non-breaking space encoding
|
|
198
|
+
# No attribute differences should be reported
|
|
199
|
+
attr_diffs = result.differences.select { |d| d.dimension == :attribute_values }
|
|
200
|
+
expect(attr_diffs).to be_empty
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
----
|
|
204
|
+
|
|
205
|
+
==== Backward Compatibility Tests
|
|
206
|
+
|
|
207
|
+
[source,ruby]
|
|
208
|
+
----
|
|
209
|
+
context "backward compatibility" do
|
|
210
|
+
it "works when format is not specified (auto-detect)" do
|
|
211
|
+
html1 = '<span>text</span>'
|
|
212
|
+
html2 = '<span>text</span>'
|
|
213
|
+
|
|
214
|
+
expect(Canon::Comparison.equivalent?(html1, html2)).to be true
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
it "handles strings with :html format (legacy behavior)" do
|
|
218
|
+
html1 = '<span>text</span>'
|
|
219
|
+
html2 = '<span>text</span>'
|
|
220
|
+
|
|
221
|
+
expect(Canon::Comparison.equivalent?(html1, html2, format: :html)).to be true
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
----
|
|
225
|
+
|
|
226
|
+
== Implementation Checklist
|
|
227
|
+
|
|
228
|
+
* [ ] Modify `parse_html` in `lib/canon/comparison.rb`
|
|
229
|
+
* [ ] Add helper methods: `already_parsed?`, `detect_and_parse_html`, `detect_html_version`
|
|
230
|
+
* [ ] Update `dom_diff` to preserve format (line 320)
|
|
231
|
+
* [ ] Add unit tests for `parse_html` method
|
|
232
|
+
* [ ] Add integration tests for lang/xml:lang
|
|
233
|
+
* [ ] Add backward compatibility tests
|
|
234
|
+
* [ ] Run full test suite to ensure no regressions
|
|
235
|
+
|
|
236
|
+
== Expected Outcomes
|
|
237
|
+
|
|
238
|
+
After this fix:
|
|
239
|
+
|
|
240
|
+
. *`lang` and `xml:lang` are treated as distinct attributes in HTML5/HTML4*
|
|
241
|
+
. *No false attribute differences when both documents have identical attributes*
|
|
242
|
+
. *User can explicitly control parser via `format: :html5` or `format: :html4`*
|
|
243
|
+
. *Backward compatible with existing code (auto-detect still works)*
|
|
244
|
+
. *Consistent parsing regardless of input type (string vs DocumentFragment)*
|
|
245
|
+
|
|
246
|
+
== Notes
|
|
247
|
+
|
|
248
|
+
. HTML entity normalization (` ` vs `␣`) is intentionally NOT changed - these are semantically equivalent but different serializations, and the diff correctly shows this difference
|
|
249
|
+
. XML comparison continues to use `XML.fragment` - this fix only affects HTML parsing
|
|
250
|
+
. The semantic path already works correctly via `Canon::Html::DataModel.from_html`
|