canon 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +69 -92
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/Gemfile +1 -0
  6. data/docs/_config.yml +90 -1
  7. data/docs/advanced/diff-classification.adoc +82 -2
  8. data/docs/advanced/extending-canon.adoc +193 -0
  9. data/docs/features/match-options/index.adoc +239 -1
  10. data/docs/internals/diffnode-enrichment.adoc +611 -0
  11. data/docs/internals/index.adoc +251 -0
  12. data/docs/lychee.toml +13 -6
  13. data/docs/understanding/architecture.adoc +749 -33
  14. data/docs/understanding/comparison-pipeline.adoc +122 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +87 -0
  27. data/lib/canon/comparison/html_comparator.rb +70 -26
  28. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  29. data/lib/canon/comparison/html_parser.rb +80 -0
  30. data/lib/canon/comparison/json_comparator.rb +12 -0
  31. data/lib/canon/comparison/json_parser.rb +19 -0
  32. data/lib/canon/comparison/markup_comparator.rb +293 -0
  33. data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
  34. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  35. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  36. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  37. data/lib/canon/comparison/match_options.rb +68 -463
  38. data/lib/canon/comparison/profile_definition.rb +149 -0
  39. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  40. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  41. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  42. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  43. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  44. data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
  45. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  46. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  47. data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
  48. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
  49. data/lib/canon/comparison/xml_comparator.rb +97 -684
  50. data/lib/canon/comparison/xml_node_comparison.rb +319 -0
  51. data/lib/canon/comparison/xml_parser.rb +19 -0
  52. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  53. data/lib/canon/comparison.rb +265 -110
  54. data/lib/canon/diff/diff_classifier.rb +101 -2
  55. data/lib/canon/diff/diff_node.rb +32 -2
  56. data/lib/canon/diff/formatting_detector.rb +1 -1
  57. data/lib/canon/diff/node_serializer.rb +191 -0
  58. data/lib/canon/diff/path_builder.rb +143 -0
  59. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  60. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  61. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  62. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  64. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  65. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  66. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  67. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  68. data/lib/canon/diff_formatter.rb +1 -1
  69. data/lib/canon/rspec_matchers.rb +38 -9
  70. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  71. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  72. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  73. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  74. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  75. data/lib/canon/version.rb +1 -1
  76. data/lib/canon/xml/data_model.rb +24 -13
  77. metadata +48 -2
@@ -0,0 +1,251 @@
1
+ ---
2
+ title: Internals
3
+ parent: Advanced
4
+ nav_order: 7
5
+ has_children: true
6
+ ---
7
+ = Internals
8
+
9
+ == Purpose
10
+
11
+ This section contains detailed implementation documentation for Canon's internal systems. These documents explain how Canon works under the hood, covering data structures, algorithms, and architectural patterns.
12
+
13
+ == Audience
14
+
15
+ These documents are intended for:
16
+
17
+ * Canon contributors and maintainers
18
+ * Developers extending Canon with custom functionality
19
+ * Anyone debugging complex comparison issues
20
+ * Users who want to understand implementation details
21
+
22
+ == Topics
23
+
24
+ link:diffnode-enrichment[**DiffNode Enrichment**]::
25
+ How DiffNode objects carry location, serialized content, and attribute metadata through the comparison pipeline. Covers PathBuilder (canonical paths with ordinal indices), NodeSerializer (library-agnostic serialization), and how Layer 2 algorithms populate metadata for Layer 4 rendering.
26
+
27
+ link:../advanced/dom-diff-internals[**DOM Diff Internals**]::
28
+ Deep dive into Canon's default DOM diff algorithm: position-based matching, operation detection, and diff generation.
29
+
30
+ link:../advanced/semantic-tree-diff-internals[**Semantic Tree Diff Internals**]::
31
+ How the experimental semantic tree diff works: signature calculation, similarity matching, and operation classification.
32
+
33
+ link:../advanced/verbose-mode-architecture[**Verbose Mode Architecture**]::
34
+ The two-tier diff output system: normative vs informative diffs, and how verbose mode enriches output.
35
+
36
+ link:../advanced/diff-classification[**Diff Classification System**]::
37
+ How Canon classifies differences as normative (structural) or informative (presentational).
38
+
39
+ link:../advanced/diff-pipeline[**Diff Pipeline Architecture**]::
40
+ The six-layer technical pipeline from input to formatted output.
41
+
42
+ link:../advanced/extending-canon[**Extending Canon**]::
43
+ How to create custom comparators, formatters, and match strategies.
44
+
45
+ == Core Concepts
46
+
47
+ === Library Agnosticism
48
+
49
+ Canon is designed to work with multiple parsing libraries (Nokogiri, Moxml, Canon::Xml::Node) without being tied to any specific implementation. This is achieved through:
50
+
51
+ * **Adapter pattern**: Format-specific adapters normalize different node types
52
+ * **Utility classes**: PathBuilder and NodeSerializer work with any library
53
+ * **Interface-based design**: Code depends on behavior (`respond_to?`), not on concrete types
54
+
55
+ This allows Canon to:
56
+ * Support new parsing libraries without major refactoring
57
+ * Switch libraries for better performance or features
58
+ * Remain compatible as libraries evolve
59
+
60
+ === The 4-Layer Architecture
61
+
62
+ Canon separates comparison concerns into four layers:
63
+
64
+ * **Layer 1**: Preprocessing - Normalize documents before comparison
65
+ * **Layer 2**: Algorithm - Choose comparison strategy (DOM vs Semantic)
66
+ * **Layer 3**: Match Options - Configure what to compare
67
+ * **Layer 4**: Diff Formatting - Control output presentation
68
+
69
+ Only Layer 2 differs between algorithms, and the enriched DiffNode structure ensures clean communication between layers.
70
+
71
+ See link:../understanding/architecture[Architecture] for the complete overview.
72
+
73
+ === Enriched Metadata Flow
74
+
75
+ [mermaid]
76
+ ----
77
+ graph LR
78
+ A[Layer 2: Algorithm] -->|Creates DiffNode| B[PathBuilder]
79
+ A -->|Creates DiffNode| C[NodeSerializer]
80
+ B -->|Enriches| D[DiffNode.path]
81
+ C -->|Enriches| E[DiffNode.serialized_before/after]
82
+ C -->|Enriches| F[DiffNode.attributes_before/after]
83
+ D --> G[Layer 4: Rendering]
84
+ E --> G
85
+ F --> G
86
+ G --> H[Accurate diff output]
87
+
88
+ style A fill:#fff4e1
89
+ style D fill:#e1f5ff
90
+ style G fill:#e1ffe1
91
+ ----
92
+
93
+ Key insight: Metadata is captured at diff creation time (Layer 2) and carried through to rendering (Layer 4), ensuring accurate display even if nodes are modified during comparison.
94
+
95
+ == Data Structures
96
+
97
+ === DiffNode
98
+
99
+ Represents a semantic difference between two nodes:
100
+
101
+ [source,ruby]
102
+ ----
103
+ class DiffNode
104
+ # Core properties
105
+ attr_reader :node1, :node2 # Raw node references
106
+ attr_accessor :dimension, :reason # What changed and why
107
+ attr_accessor :normative, :formatting # Classification
108
+
109
+ # Enriched metadata for Layer 4 rendering
110
+ attr_accessor :path # Canonical path with ordinal indices
111
+ attr_accessor :serialized_before # Serialized "before" content
112
+ attr_accessor :serialized_after # Serialized "after" content
113
+ attr_accessor :attributes_before # Normalized "before" attributes
114
+ attr_accessor :attributes_after # Normalized "after" attributes
115
+ end
116
+ ----
117
+
118
+ See link:diffnode-enrichment[DiffNode Enrichment] for details.
119
+
120
+ === TreeNode (Semantic Diff)
121
+
122
+ Canonical node representation used by the semantic diff algorithm:
123
+
124
+ [source,ruby]
125
+ ----
126
+ class TreeNode
127
+ attr_reader :label # Element name (e.g., "div", "span")
128
+ attr_reader :parent # Parent TreeNode
129
+ attr_reader :children # Array of child TreeNodes
130
+ attr_reader :attributes # Normalized attribute hash
131
+ attr_reader :source_node # Original parsing library node
132
+ attr_reader :signature # Calculated signature for matching
133
+ end
134
+ ----
135
+
136
+ See link:../advanced/semantic-tree-diff-internals[Semantic Tree Diff Internals] for details.
137
+
138
+ === ComparisonResult
139
+
140
+ Result object from verbose comparison:
141
+
142
+ [source,ruby]
143
+ ----
144
+ class ComparisonResult
145
+ attr_reader :differences # Array of DiffNode objects
146
+ attr_reader :preprocessed_strings # Preprocessed document strings
147
+ attr_reader :original_strings # Original document strings
148
+ attr_reader :format # :xml, :html, :json, :yaml
149
+ attr_reader :match_options # Resolved match options
150
+ attr_reader :algorithm # :dom or :semantic
151
+ end
152
+ ----
153
+
154
+ == Utility Classes
155
+
156
+ === PathBuilder
157
+
158
+ Generates canonical XPath-like paths with ordinal indices:
159
+
160
+ [source,ruby]
161
+ ----
162
+ # Build path for any node type
163
+ path = Canon::Diff::PathBuilder.build(node)
164
+ # => "/#document/div[0]/body[0]/p[1]/span[2]"
165
+
166
+ # Build human-readable path
167
+ human = Canon::Diff::PathBuilder.human_path(node)
168
+ # => "#document → div[0] → body[0] → p[1] → span[2]"
169
+ ----
170
+
171
+ **Features**:
172
+ * Library-agnostic: works with TreeNodes, Canon::Xml::Node, Nokogiri nodes
173
+ * Ordinal indices: uniquely identifies nodes among siblings
174
+ * Traverses parent hierarchy: builds complete path from root to node
175
+
176
+ See link:diffnode-enrichment#pathbuilder-canonical-paths-with-ordinal-indices[PathBuilder documentation] for details.
177
+
178
+ === NodeSerializer
179
+
180
+ Serializes nodes and extracts attributes regardless of parsing library:
181
+
182
+ [source,ruby]
183
+ ----
184
+ # Serialize any node
185
+ serialized = Canon::Diff::NodeSerializer.serialize(node)
186
+
187
+ # Extract normalized attributes
188
+ attrs = Canon::Diff::NodeSerializer.extract_attributes(node)
189
+ # => {"lang" => "EN-GB", "xml:lang" => "EN-GB", "id" => "example"}
190
+ ----
191
+
192
+ **Features**:
193
+ * Library-agnostic: handles Canon::Xml::Node, Nokogiri, Moxml
194
+ * Normalized output: consistent format regardless of source library
195
+ * Attribute extraction: returns hash of name-value pairs
196
+
197
+ See link:diffnode-enrichment#nodeserializer-library-agnostic-serialization[NodeSerializer documentation] for details.
198
+
199
+ == Algorithm Integration
200
+
201
+ === DOM Algorithm
202
+
203
+ Enriches DiffNodes during positional comparison in `lib/canon/comparison/xml_comparator.rb`:
204
+
205
+ [source,ruby]
206
+ ----
207
+ def add_difference(node1, node2, diff1, diff2, dimension, opts, differences)
208
+ metadata = enrich_diff_metadata(node1, node2)
209
+ diff_node = Canon::Diff::DiffNode.new(
210
+ node1: node1,
211
+ node2: node2,
212
+ dimension: dimension,
213
+ reason: build_difference_reason(node1, node2, diff1, diff2, dimension),
214
+ **metadata # Enriched from raw Nokogiri/Canon nodes
215
+ )
216
+ differences << diff_node
217
+ end
218
+ ----
219
+
220
+ See link:../understanding/algorithms/dom-diff[DOM Diff Algorithm] for details.
221
+
222
+ === Semantic Algorithm
223
+
224
+ Enriches DiffNodes during operation conversion in `lib/canon/tree_diff/operation_converter.rb`:
225
+
226
+ [source,ruby]
227
+ ----
228
+ def convert_insert(operation)
229
+ tree_node2 = operation[:node]
230
+ node2 = extract_source_node(tree_node2)
231
+ metadata = enrich_diff_metadata(nil, tree_node2)
232
+ diff_node = Canon::Diff::DiffNode.new(
233
+ node1: nil,
234
+ node2: node2,
235
+ dimension: :element_structure,
236
+ reason: build_insert_reason(operation),
237
+ **metadata # Enriched from TreeNode
238
+ )
239
+ diff_node.normative = determine_normative(:element_structure)
240
+ diff_node
241
+ end
242
+ ----
243
+
244
+ See link:../understanding/algorithms/semantic-tree-diff[Semantic Tree Diff Algorithm] for details.
245
+
246
+ == See Also
247
+
248
+ * link:../understanding/architecture[Architecture] - 4-layer architecture overview
249
+ * link:../understanding/algorithms/[Algorithms] - DOM and Semantic algorithm details
250
+ * link:../features/diff-formatting/[Diff Formatting] - Layer 4 rendering options
251
+ * link:../advanced/[Advanced Topics] - Deep technical documentation
data/docs/lychee.toml CHANGED
@@ -12,9 +12,11 @@ include_verbatim = true
12
12
  # Recursively check all files
13
13
  recursive = true
14
14
 
15
- # File types to check
15
+ # File types to check (regex patterns)
16
16
  include = [
17
- "_site/**/*.html"
17
+ "_site/**/*.html",
18
+ ".*\\.adoc$",
19
+ ".*\\.md$"
18
20
  ]
19
21
 
20
22
  # Excluded paths
@@ -25,7 +27,9 @@ exclude = [
25
27
  "vendor",
26
28
  ".bundle",
27
29
  ".sass-cache",
28
- ".jekyll-cache"
30
+ ".jekyll-cache",
31
+ "_site/.jekyll-cache",
32
+ "Gemfile.lock"
29
33
  ]
30
34
 
31
35
  # Link checking behavior
@@ -56,10 +60,13 @@ include_mail = false # Don't check mailto: links
56
60
  max_concurrency = 10
57
61
 
58
62
  # Verbose output for debugging
59
- verbose = "info"
63
+ verbose = "warn"
60
64
 
61
65
  # Require HTTPS where possible
62
66
  require_https = false # Don't enforce
63
67
 
64
- # Index files
65
- index_files = ["index.html"]
68
+ # Index files for directory URLs
69
+ index_files = ["index.html"]
70
+
71
+ # Ignore patterns file
72
+ ignore_file = ".lycheeignore"