canon 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +69 -92
- data/README.adoc +13 -13
- data/docs/.lycheeignore +69 -0
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +82 -2
- data/docs/advanced/extending-canon.adoc +193 -0
- data/docs/features/match-options/index.adoc +239 -1
- data/docs/internals/diffnode-enrichment.adoc +611 -0
- data/docs/internals/index.adoc +251 -0
- data/docs/lychee.toml +13 -6
- data/docs/understanding/architecture.adoc +749 -33
- data/docs/understanding/comparison-pipeline.adoc +122 -0
- data/lib/canon/cache.rb +129 -0
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
- data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
- data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
- data/lib/canon/comparison/dimensions/registry.rb +77 -0
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
- data/lib/canon/comparison/dimensions.rb +54 -0
- data/lib/canon/comparison/format_detector.rb +87 -0
- data/lib/canon/comparison/html_comparator.rb +70 -26
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/html_parser.rb +80 -0
- data/lib/canon/comparison/json_comparator.rb +12 -0
- data/lib/canon/comparison/json_parser.rb +19 -0
- data/lib/canon/comparison/markup_comparator.rb +293 -0
- data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
- data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
- data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
- data/lib/canon/comparison/match_options.rb +68 -463
- data/lib/canon/comparison/profile_definition.rb +149 -0
- data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
- data/lib/canon/comparison/xml_comparator.rb +97 -684
- data/lib/canon/comparison/xml_node_comparison.rb +319 -0
- data/lib/canon/comparison/xml_parser.rb +19 -0
- data/lib/canon/comparison/yaml_comparator.rb +3 -3
- data/lib/canon/comparison.rb +265 -110
- data/lib/canon/diff/diff_classifier.rb +101 -2
- data/lib/canon/diff/diff_node.rb +32 -2
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/node_serializer.rb +191 -0
- data/lib/canon/diff/path_builder.rb +143 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
- data/lib/canon/diff_formatter.rb +1 -1
- data/lib/canon/rspec_matchers.rb +38 -9
- data/lib/canon/tree_diff/operation_converter.rb +92 -338
- data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
- data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
- data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +48 -2
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Internals
|
|
3
|
+
parent: Advanced
|
|
4
|
+
nav_order: 7
|
|
5
|
+
has_children: true
|
|
6
|
+
---
|
|
7
|
+
= Internals
|
|
8
|
+
|
|
9
|
+
== Purpose
|
|
10
|
+
|
|
11
|
+
This section contains detailed implementation documentation for Canon's internal systems. These documents explain how Canon works under the hood, covering data structures, algorithms, and architectural patterns.
|
|
12
|
+
|
|
13
|
+
== Audience
|
|
14
|
+
|
|
15
|
+
These documents are intended for:
|
|
16
|
+
|
|
17
|
+
* Canon contributors and maintainers
|
|
18
|
+
* Developers extending Canon with custom functionality
|
|
19
|
+
* Anyone debugging complex comparison issues
|
|
20
|
+
* Users wanting to understand implementation details
|
|
21
|
+
|
|
22
|
+
== Topics
|
|
23
|
+
|
|
24
|
+
link:diffnode-enrichment[**DiffNode Enrichment**]::
|
|
25
|
+
How DiffNode objects carry location, serialized content, and attribute metadata through the comparison pipeline. Covers PathBuilder (canonical paths with ordinal indices), NodeSerializer (library-agnostic serialization), and how Layer 2 algorithms populate metadata for Layer 4 rendering.
|
|
26
|
+
|
|
27
|
+
link:../advanced/dom-diff-internals[**DOM Diff Internals**]::
|
|
28
|
+
Deep dive into Canon's default DOM diff algorithm: position-based matching, operation detection, and diff generation.
|
|
29
|
+
|
|
30
|
+
link:../advanced/semantic-tree-diff-internals[**Semantic Tree Diff Internals**]::
|
|
31
|
+
How the experimental semantic tree diff works: signature calculation, similarity matching, and operation classification.
|
|
32
|
+
|
|
33
|
+
link:../advanced/verbose-mode-architecture[**Verbose Mode Architecture**]::
|
|
34
|
+
The two-tier diff output system: normative vs informative diffs, and how verbose mode enriches output.
|
|
35
|
+
|
|
36
|
+
link:../advanced/diff-classification[**Diff Classification System**]::
|
|
37
|
+
How Canon classifies differences as normative (structural) or informative (presentational).
|
|
38
|
+
|
|
39
|
+
link:../advanced/diff-pipeline[**Diff Pipeline Architecture**]::
|
|
40
|
+
The six-layer technical pipeline from input to formatted output.
|
|
41
|
+
|
|
42
|
+
link:../advanced/extending-canon[**Extending Canon**]::
|
|
43
|
+
How to create custom comparators, formatters, and match strategies.
|
|
44
|
+
|
|
45
|
+
== Core Concepts
|
|
46
|
+
|
|
47
|
+
=== Library Agnosticism
|
|
48
|
+
|
|
49
|
+
Canon is designed to work with multiple parsing libraries and node models (Nokogiri, Moxml, Canon::Xml::Node) without being tied to any specific implementation. This is achieved through:
|
|
50
|
+
|
|
51
|
+
* **Adapter pattern**: Format-specific adapters normalize different node types
|
|
52
|
+
* **Utility classes**: PathBuilder and NodeSerializer work with any library
|
|
53
|
+
* **Interface-based design**: Code depends on behavior (respond_to?) not concrete types
|
|
54
|
+
|
|
55
|
+
This allows Canon to:
|
|
56
|
+
* Support new parsing libraries without major refactoring
|
|
57
|
+
* Switch libraries for better performance or features
|
|
58
|
+
* Remain compatible as libraries evolve
|
|
59
|
+
|
|
60
|
+
=== The 4-Layer Architecture
|
|
61
|
+
|
|
62
|
+
Canon separates comparison concerns into four layers:
|
|
63
|
+
|
|
64
|
+
* **Layer 1**: Preprocessing - Normalize documents before comparison
|
|
65
|
+
* **Layer 2**: Algorithm - Choose comparison strategy (DOM vs Semantic)
|
|
66
|
+
* **Layer 3**: Match Options - Configure what to compare
|
|
67
|
+
* **Layer 4**: Diff Formatting - Control output presentation
|
|
68
|
+
|
|
69
|
+
Only Layer 2 differs between algorithms, and the enriched DiffNode structure ensures clean communication between layers.
|
|
70
|
+
|
|
71
|
+
See link:../understanding/architecture[Architecture] for the complete overview.
|
|
72
|
+
|
|
73
|
+
=== Enriched Metadata Flow
|
|
74
|
+
|
|
75
|
+
[mermaid]
|
|
76
|
+
----
|
|
77
|
+
graph LR
|
|
78
|
+
A[Layer 2: Algorithm] -->|Creates DiffNode| B[PathBuilder]
|
|
79
|
+
A -->|Creates DiffNode| C[NodeSerializer]
|
|
80
|
+
B -->|Enriches| D[DiffNode.path]
|
|
81
|
+
C -->|Enriches| E[DiffNode.serialized_before/after]
|
|
82
|
+
C -->|Enriches| F[DiffNode.attributes_before/after]
|
|
83
|
+
D --> G[Layer 4: Rendering]
|
|
84
|
+
E --> G
|
|
85
|
+
F --> G
|
|
86
|
+
G --> H[Accurate diff output]
|
|
87
|
+
|
|
88
|
+
style A fill:#fff4e1
|
|
89
|
+
style D fill:#e1f5ff
|
|
90
|
+
style G fill:#e1ffe1
|
|
91
|
+
----
|
|
92
|
+
|
|
93
|
+
Key insight: Metadata is captured at diff creation time (Layer 2) and carried through to rendering (Layer 4), ensuring accurate display even if nodes are modified during comparison.
|
|
94
|
+
|
|
95
|
+
== Data Structures
|
|
96
|
+
|
|
97
|
+
=== DiffNode
|
|
98
|
+
|
|
99
|
+
Represents a semantic difference between two nodes:
|
|
100
|
+
|
|
101
|
+
[source,ruby]
|
|
102
|
+
----
|
|
103
|
+
class DiffNode
|
|
104
|
+
# Core properties
|
|
105
|
+
attr_reader :node1, :node2 # Raw node references
|
|
106
|
+
attr_accessor :dimension, :reason # What changed and why
|
|
107
|
+
attr_accessor :normative, :formatting # Classification
|
|
108
|
+
|
|
109
|
+
# Enriched metadata for Layer 4 rendering
|
|
110
|
+
attr_accessor :path # Canonical path with ordinal indices
|
|
111
|
+
attr_accessor :serialized_before # Serialized "before" content
|
|
112
|
+
attr_accessor :serialized_after # Serialized "after" content
|
|
113
|
+
attr_accessor :attributes_before # Normalized "before" attributes
|
|
114
|
+
attr_accessor :attributes_after # Normalized "after" attributes
|
|
115
|
+
end
|
|
116
|
+
----
|
|
117
|
+
|
|
118
|
+
See link:diffnode-enrichment[DiffNode Enrichment] for details.
|
|
119
|
+
|
|
120
|
+
=== TreeNode (Semantic Diff)
|
|
121
|
+
|
|
122
|
+
Canonical node representation used by the semantic tree diff algorithm:
|
|
123
|
+
|
|
124
|
+
[source,ruby]
|
|
125
|
+
----
|
|
126
|
+
class TreeNode
|
|
127
|
+
attr_reader :label # Element name (e.g., "div", "span")
|
|
128
|
+
attr_reader :parent # Parent TreeNode
|
|
129
|
+
attr_reader :children # Array of child TreeNodes
|
|
130
|
+
attr_reader :attributes # Normalized attribute hash
|
|
131
|
+
attr_reader :source_node # Original parsing library node
|
|
132
|
+
attr_reader :signature # Calculated signature for matching
|
|
133
|
+
end
|
|
134
|
+
----
|
|
135
|
+
|
|
136
|
+
See link:../advanced/semantic-tree-diff-internals[Semantic Tree Diff Internals] for details.
|
|
137
|
+
|
|
138
|
+
=== ComparisonResult
|
|
139
|
+
|
|
140
|
+
Result object from verbose comparison:
|
|
141
|
+
|
|
142
|
+
[source,ruby]
|
|
143
|
+
----
|
|
144
|
+
class ComparisonResult
|
|
145
|
+
attr_reader :differences # Array of DiffNode objects
|
|
146
|
+
attr_reader :preprocessed_strings # Preprocessed document strings
|
|
147
|
+
attr_reader :original_strings # Original document strings
|
|
148
|
+
attr_reader :format # :xml, :html, :json, :yaml
|
|
149
|
+
attr_reader :match_options # Resolved match options
|
|
150
|
+
attr_reader :algorithm # :dom or :semantic
|
|
151
|
+
end
|
|
152
|
+
----
|
|
153
|
+
|
|
154
|
+
== Utility Classes
|
|
155
|
+
|
|
156
|
+
=== PathBuilder
|
|
157
|
+
|
|
158
|
+
Generates canonical XPath-like paths with ordinal indices:
|
|
159
|
+
|
|
160
|
+
[source,ruby]
|
|
161
|
+
----
|
|
162
|
+
# Build path for any node type
|
|
163
|
+
path = Canon::Diff::PathBuilder.build(node)
|
|
164
|
+
# => "/#document/div[0]/body[0]/p[1]/span[2]"
|
|
165
|
+
|
|
166
|
+
# Build human-readable path
|
|
167
|
+
human = Canon::Diff::PathBuilder.human_path(node)
|
|
168
|
+
# => "#document → div[0] → body[0] → p[1] → span[2]"
|
|
169
|
+
----
|
|
170
|
+
|
|
171
|
+
**Features**:
|
|
172
|
+
* Library-agnostic: works with TreeNodes, Canon::Xml::Node, Nokogiri nodes
|
|
173
|
+
* Ordinal indices: uniquely identifies nodes among siblings
|
|
174
|
+
* Traverses parent hierarchy: builds complete path from root to node
|
|
175
|
+
|
|
176
|
+
See link:diffnode-enrichment#pathbuilder-canonical-paths-with-ordinal-indices[PathBuilder documentation] for details.
|
|
177
|
+
|
|
178
|
+
=== NodeSerializer
|
|
179
|
+
|
|
180
|
+
Serializes nodes and extracts attributes regardless of parsing library:
|
|
181
|
+
|
|
182
|
+
[source,ruby]
|
|
183
|
+
----
|
|
184
|
+
# Serialize any node
|
|
185
|
+
serialized = Canon::Diff::NodeSerializer.serialize(node)
|
|
186
|
+
|
|
187
|
+
# Extract normalized attributes
|
|
188
|
+
attrs = Canon::Diff::NodeSerializer.extract_attributes(node)
|
|
189
|
+
# => {"lang" => "EN-GB", "xml:lang" => "EN-GB", "id" => "example"}
|
|
190
|
+
----
|
|
191
|
+
|
|
192
|
+
**Features**:
|
|
193
|
+
* Library-agnostic: handles Canon::Xml::Node, Nokogiri, Moxml
|
|
194
|
+
* Normalized output: consistent format regardless of source library
|
|
195
|
+
* Attribute extraction: returns hash of name-value pairs
|
|
196
|
+
|
|
197
|
+
See link:diffnode-enrichment#nodeserializer-library-agnostic-serialization[NodeSerializer documentation] for details.
|
|
198
|
+
|
|
199
|
+
== Algorithm Integration
|
|
200
|
+
|
|
201
|
+
=== DOM Algorithm
|
|
202
|
+
|
|
203
|
+
The DOM algorithm enriches DiffNodes during positional comparison in `lib/canon/comparison/xml_comparator.rb`:
|
|
204
|
+
|
|
205
|
+
[source,ruby]
|
|
206
|
+
----
|
|
207
|
+
def add_difference(node1, node2, diff1, diff2, dimension, opts, differences)
|
|
208
|
+
metadata = enrich_diff_metadata(node1, node2)
|
|
209
|
+
diff_node = Canon::Diff::DiffNode.new(
|
|
210
|
+
node1: node1,
|
|
211
|
+
node2: node2,
|
|
212
|
+
dimension: dimension,
|
|
213
|
+
reason: build_difference_reason(node1, node2, diff1, diff2, dimension),
|
|
214
|
+
**metadata # Enriched from raw Nokogiri/Canon nodes
|
|
215
|
+
)
|
|
216
|
+
differences << diff_node
|
|
217
|
+
end
|
|
218
|
+
----
|
|
219
|
+
|
|
220
|
+
See link:../understanding/algorithms/dom-diff[DOM Diff Algorithm] for details.
|
|
221
|
+
|
|
222
|
+
=== Semantic Algorithm
|
|
223
|
+
|
|
224
|
+
The semantic algorithm enriches DiffNodes during operation conversion in `lib/canon/tree_diff/operation_converter.rb`:
|
|
225
|
+
|
|
226
|
+
[source,ruby]
|
|
227
|
+
----
|
|
228
|
+
def convert_insert(operation)
|
|
229
|
+
tree_node2 = operation[:node]
|
|
230
|
+
node2 = extract_source_node(tree_node2)
|
|
231
|
+
metadata = enrich_diff_metadata(nil, tree_node2)
|
|
232
|
+
diff_node = Canon::Diff::DiffNode.new(
|
|
233
|
+
node1: nil,
|
|
234
|
+
node2: node2,
|
|
235
|
+
dimension: :element_structure,
|
|
236
|
+
reason: build_insert_reason(operation),
|
|
237
|
+
**metadata # Enriched from TreeNode
|
|
238
|
+
)
|
|
239
|
+
diff_node.normative = determine_normative(:element_structure)
|
|
240
|
+
diff_node
|
|
241
|
+
end
|
|
242
|
+
----
|
|
243
|
+
|
|
244
|
+
See link:../understanding/algorithms/semantic-tree-diff[Semantic Tree Diff Algorithm] for details.
|
|
245
|
+
|
|
246
|
+
== See Also
|
|
247
|
+
|
|
248
|
+
* link:../understanding/architecture[Architecture] - 4-layer architecture overview
|
|
249
|
+
* link:../understanding/algorithms/[Algorithms] - DOM and Semantic algorithm details
|
|
250
|
+
* link:../features/diff-formatting/[Diff Formatting] - Layer 4 rendering options
|
|
251
|
+
* link:../advanced/[Advanced Topics] - Deep technical documentation
|
data/docs/lychee.toml
CHANGED
|
@@ -12,9 +12,11 @@ include_verbatim = true
|
|
|
12
12
|
# Recursively check all files
|
|
13
13
|
recursive = true
|
|
14
14
|
|
|
15
|
-
# File types to check
|
|
15
|
+
# File types to check (regex patterns)
|
|
16
16
|
include = [
|
|
17
|
-
"_site/**/*.html"
|
|
17
|
+
"_site/**/*.html",
|
|
18
|
+
".*\\.adoc$",
|
|
19
|
+
".*\\.md$"
|
|
18
20
|
]
|
|
19
21
|
|
|
20
22
|
# Excluded paths
|
|
@@ -25,7 +27,9 @@ exclude = [
|
|
|
25
27
|
"vendor",
|
|
26
28
|
".bundle",
|
|
27
29
|
".sass-cache",
|
|
28
|
-
".jekyll-cache"
|
|
30
|
+
".jekyll-cache",
|
|
31
|
+
"_site/.jekyll-cache",
|
|
32
|
+
"Gemfile.lock"
|
|
29
33
|
]
|
|
30
34
|
|
|
31
35
|
# Link checking behavior
|
|
@@ -56,10 +60,13 @@ include_mail = false # Don't check mailto: links
|
|
|
56
60
|
max_concurrency = 10
|
|
57
61
|
|
|
58
62
|
# Verbose output for debugging
|
|
59
|
-
verbose = "
|
|
63
|
+
verbose = "warn"
|
|
60
64
|
|
|
61
65
|
# Require HTTPS where possible
|
|
62
66
|
require_https = false # Don't enforce
|
|
63
67
|
|
|
64
|
-
# Index files
|
|
65
|
-
index_files = ["index.html"]
|
|
68
|
+
# Index files for directory URLs
|
|
69
|
+
index_files = ["index.html"]
|
|
70
|
+
|
|
71
|
+
# Ignore patterns file
|
|
72
|
+
ignore_file = ".lycheeignore"
|