canon 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -1
  3. data/.rubocop_todo.yml +276 -7
  4. data/README.adoc +203 -138
  5. data/_config.yml +116 -0
  6. data/docs/ADVANCED_TOPICS.adoc +20 -0
  7. data/docs/BASIC_USAGE.adoc +16 -0
  8. data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
  9. data/docs/CLI.adoc +493 -0
  10. data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  11. data/docs/DIFF_ARCHITECTURE.adoc +435 -0
  12. data/docs/DIFF_FORMATTING.adoc +540 -0
  13. data/docs/FORMATS.adoc +447 -0
  14. data/docs/INDEX.adoc +222 -0
  15. data/docs/INPUT_VALIDATION.adoc +477 -0
  16. data/docs/MATCH_ARCHITECTURE.adoc +463 -0
  17. data/docs/MATCH_OPTIONS.adoc +719 -0
  18. data/docs/MODES.adoc +432 -0
  19. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  20. data/docs/OPTIONS.adoc +1387 -0
  21. data/docs/PREPROCESSING.adoc +491 -0
  22. data/docs/RSPEC.adoc +605 -0
  23. data/docs/RUBY_API.adoc +478 -0
  24. data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
  25. data/docs/UNDERSTANDING_CANON.adoc +17 -0
  26. data/docs/VERBOSE.adoc +482 -0
  27. data/exe/canon +7 -0
  28. data/lib/canon/cli.rb +179 -0
  29. data/lib/canon/commands/diff_command.rb +195 -0
  30. data/lib/canon/commands/format_command.rb +113 -0
  31. data/lib/canon/comparison/base_comparator.rb +39 -0
  32. data/lib/canon/comparison/comparison_result.rb +79 -0
  33. data/lib/canon/comparison/html_comparator.rb +410 -0
  34. data/lib/canon/comparison/json_comparator.rb +212 -0
  35. data/lib/canon/comparison/match_options.rb +616 -0
  36. data/lib/canon/comparison/xml_comparator.rb +566 -0
  37. data/lib/canon/comparison/yaml_comparator.rb +93 -0
  38. data/lib/canon/comparison.rb +239 -0
  39. data/lib/canon/config.rb +172 -0
  40. data/lib/canon/diff/diff_block.rb +71 -0
  41. data/lib/canon/diff/diff_block_builder.rb +105 -0
  42. data/lib/canon/diff/diff_classifier.rb +46 -0
  43. data/lib/canon/diff/diff_context.rb +85 -0
  44. data/lib/canon/diff/diff_context_builder.rb +107 -0
  45. data/lib/canon/diff/diff_line.rb +77 -0
  46. data/lib/canon/diff/diff_node.rb +56 -0
  47. data/lib/canon/diff/diff_node_mapper.rb +148 -0
  48. data/lib/canon/diff/diff_report.rb +133 -0
  49. data/lib/canon/diff/diff_report_builder.rb +62 -0
  50. data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
  51. data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
  52. data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
  53. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
  54. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
  55. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
  56. data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
  57. data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
  58. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
  59. data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
  60. data/lib/canon/diff_formatter/character_map.yml +197 -0
  61. data/lib/canon/diff_formatter/debug_output.rb +431 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
  63. data/lib/canon/diff_formatter/legend.rb +141 -0
  64. data/lib/canon/diff_formatter.rb +520 -0
  65. data/lib/canon/errors.rb +56 -0
  66. data/lib/canon/formatters/html4_formatter.rb +17 -0
  67. data/lib/canon/formatters/html5_formatter.rb +17 -0
  68. data/lib/canon/formatters/html_formatter.rb +37 -0
  69. data/lib/canon/formatters/html_formatter_base.rb +163 -0
  70. data/lib/canon/formatters/json_formatter.rb +3 -0
  71. data/lib/canon/formatters/xml_formatter.rb +20 -55
  72. data/lib/canon/formatters/yaml_formatter.rb +4 -1
  73. data/lib/canon/pretty_printer/html.rb +57 -0
  74. data/lib/canon/pretty_printer/json.rb +25 -0
  75. data/lib/canon/pretty_printer/xml.rb +29 -0
  76. data/lib/canon/rspec_matchers.rb +222 -80
  77. data/lib/canon/validators/base_validator.rb +49 -0
  78. data/lib/canon/validators/html_validator.rb +138 -0
  79. data/lib/canon/validators/json_validator.rb +89 -0
  80. data/lib/canon/validators/xml_validator.rb +53 -0
  81. data/lib/canon/validators/yaml_validator.rb +73 -0
  82. data/lib/canon/version.rb +1 -1
  83. data/lib/canon/xml/attribute_handler.rb +80 -0
  84. data/lib/canon/xml/c14n.rb +36 -0
  85. data/lib/canon/xml/character_encoder.rb +38 -0
  86. data/lib/canon/xml/data_model.rb +225 -0
  87. data/lib/canon/xml/element_matcher.rb +196 -0
  88. data/lib/canon/xml/line_range_mapper.rb +158 -0
  89. data/lib/canon/xml/namespace_handler.rb +86 -0
  90. data/lib/canon/xml/node.rb +32 -0
  91. data/lib/canon/xml/nodes/attribute_node.rb +54 -0
  92. data/lib/canon/xml/nodes/comment_node.rb +23 -0
  93. data/lib/canon/xml/nodes/element_node.rb +56 -0
  94. data/lib/canon/xml/nodes/namespace_node.rb +38 -0
  95. data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
  96. data/lib/canon/xml/nodes/root_node.rb +16 -0
  97. data/lib/canon/xml/nodes/text_node.rb +23 -0
  98. data/lib/canon/xml/processor.rb +151 -0
  99. data/lib/canon/xml/whitespace_normalizer.rb +72 -0
  100. data/lib/canon/xml/xml_base_handler.rb +188 -0
  101. data/lib/canon.rb +14 -3
  102. metadata +116 -21
@@ -0,0 +1,491 @@
1
+ ---
2
+ layout: default
3
+ title: Preprocessing
4
+ nav_order: 31
5
+ parent: Customizing Behavior
6
+ ---
7
+ = Canon preprocessing options
8
+ :toc:
9
+ :toclevels: 3
10
+
11
+ == Scope
12
+
13
+ This document describes Canon's preprocessing options that transform documents
14
+ before comparison. Preprocessing is Phase 1 of Canon's three-phase comparison
15
+ architecture.
16
+
17
+ For the complete architecture, see link:MATCH_ARCHITECTURE[Match
18
+ architecture].
19
+
20
+ == General
21
+
22
+ Preprocessing transforms documents into a normalized form before semantic
23
+ matching. This eliminates format-specific variations that should not affect
24
+ semantic equivalence.
25
+
26
+ Preprocessing is optional and configured per-comparison. The default is
27
+ `none` (no preprocessing).
28
+
29
+ == Architecture context
30
+
31
+ Preprocessing is Phase 1 in Canon's comparison flow:
32
+
33
+ [source]
34
+ ----
35
+ Phase 1: PREPROCESSING → Phase 2: MATCHING → Phase 3: RENDERING
36
+ ----
37
+
38
+ Documents are preprocessed identically before comparison, ensuring consistent
39
+ input to the semantic matching phase.
40
+
41
+ == Preprocessing options
42
+
43
+ === none (default)
44
+
45
+ **Purpose**: No preprocessing -- compare documents exactly as provided.
46
+
47
+ **When to use**:
48
+
49
+ * Documents are already in comparable form
50
+ * You want to detect any differences, including formatting
51
+ * Testing exact output from generators
52
+ * Maximum strictness required
53
+
54
+ **Behavior**:
55
+
56
+ * Documents passed directly to comparison
57
+ * No normalization applied
58
+ * All whitespace preserved
59
+ * All formatting preserved
60
+
61
+ .none example
62
+ [example]
63
+ ====
64
+ [source,ruby]
65
+ ----
66
+ Canon::Comparison.equivalent?(doc1, doc2,
67
+ preprocessing: :none # or omit (default)
68
+ )
69
+ ----
70
+
71
+ Documents compared exactly as-is.
72
+ ====
73
+
74
+ === c14n
75
+
76
+ **Purpose**: Apply canonical form according to format-specific rules.
77
+
78
+ **When to use**:
79
+
80
+ * Eliminate all formatting differences
81
+ * Focus purely on semantic content
82
+ * Compare documents from different sources
83
+ * Maximum normalization needed
84
+
85
+ **Behavior per format**:
86
+
87
+ **XML**::
88
+ * W3C Canonical XML Version 1.1
89
+ * Namespace prefixes sorted
90
+ * Attributes sorted by namespace URI then local name
91
+ * Whitespace normalized
92
+ * Comments removed (unless `--with-comments` is specified)
93
+
94
+ **HTML**::
95
+ * Normalized HTML structure
96
+ * Attributes sorted
97
+ * Whitespace collapsed per CSS rules
98
+ * Empty text nodes removed
99
+
100
+ **JSON**::
101
+ * Keys sorted alphabetically at all levels
102
+ * Whitespace normalized
103
+ * Consistent formatting
104
+
105
+ **YAML**::
106
+ * Keys sorted alphabetically
107
+ * Standard YAML 1.2 format
108
+ * Consistent indentation
109
+
110
+ .c14n examples
111
+ [example]
112
+ ====
113
+ [source,ruby]
114
+ ----
115
+ # Ruby API
116
+ Canon::Comparison.equivalent?(xml1, xml2,
117
+ preprocessing: :c14n
118
+ )
119
+
120
+ # CLI
121
+ $ canon diff file1.xml file2.xml --preprocessing c14n --verbose
122
+ ----
123
+
124
+ **XML before c14n**:
125
+
126
+ [source,xml]
127
+ ----
128
+ <root xmlns:b="http://b.com" xmlns:a="http://a.com">
129
+   <item b:attr="2" a:attr="1">
130
+     Text
131
+   </item>
132
+ </root>
133
+ ----
134
+
135
+ **After c14n**:
136
+
137
+ [source,xml]
138
+ ----
139
+ <root xmlns:a="http://a.com" xmlns:b="http://b.com"><item a:attr="1" b:attr="2">Text</item></root>
140
+ ----
141
+
142
+ Namespaces and attributes sorted, whitespace removed.
143
+ ====
144
+
145
+ === normalize
146
+
147
+ **Purpose**: Normalize whitespace while preserving structure.
148
+
149
+ **When to use**:
150
+
151
+ * Ignore whitespace differences but preserve element/key order
152
+ * Compare documents with different pretty-printing
153
+ * Focus on content with flexible whitespace handling
154
+ * Middle ground between `none` and `c14n`
155
+
156
+ **Behavior**:
157
+
158
+ * Collapse multiple whitespace to single space
159
+ * Trim leading/trailing whitespace
160
+ * Normalize line endings (LF)
161
+ * Preserve element/attribute/key order
162
+ * Preserve structural whitespace
163
+
164
+ .normalize examples
165
+ [example]
166
+ ====
167
+ [source,ruby]
168
+ ----
169
+ # Ruby API
170
+ Canon::Comparison.equivalent?(xml1, xml2,
171
+ preprocessing: :normalize
172
+ )
173
+
174
+ # CLI
175
+ $ canon diff file1.xml file2.xml --preprocessing normalize --verbose
176
+ ----
177
+
178
+ **Before normalize**:
179
+
180
+ [source,xml]
181
+ ----
182
+ <root>
183
+   <item>  Text   with   spaces  </item>
184
+ </root>
185
+ ----
186
+
187
+ **After normalize**:
188
+
189
+ [source,xml]
190
+ ----
191
+ <root>
192
+   <item>Text with spaces</item>
193
+ </root>
194
+ ----
195
+
196
+ Whitespace collapsed and trimmed, structure preserved.
197
+ ====
198
+
199
+ === format
200
+
201
+ **Purpose**: Pretty-print documents with standard formatting.
202
+
203
+ **When to use**:
204
+
205
+ * Compare documents where both should be well-formatted
206
+ * Ensure consistent indentation before comparison
207
+ * Generate readable diff output
208
+ * Standardize formatting across sources
209
+
210
+ **Behavior**:
211
+
212
+ * 2-space indentation (default)
213
+ * One element/property per line
214
+ * Consistent structure
215
+ * Attributes/keys in canonical order
216
+
217
+ .format examples
218
+ [example]
219
+ ====
220
+ [source,ruby]
221
+ ----
222
+ # Ruby API
223
+ Canon::Comparison.equivalent?(xml1, xml2,
224
+ preprocessing: :format
225
+ )
226
+
227
+ # CLI
228
+ $ canon diff file1.xml file2.xml --preprocessing format --verbose
229
+ ----
230
+
231
+ **Before format**:
232
+
233
+ [source,xml]
234
+ ----
235
+ <root><a>1</a><b>2</b></root>
236
+ ----
237
+
238
+ **After format**:
239
+
240
+ [source,xml]
241
+ ----
242
+ <root>
243
+   <a>1</a>
244
+   <b>2</b>
245
+ </root>
246
+ ----
247
+
248
+ Consistent 2-space indentation applied.
249
+ ====
250
+
251
+ == Comparison table
252
+
253
+ [cols="1,1,1,1,1"]
254
+ |===
255
+ |Aspect |none |c14n |normalize |format
256
+
257
+ |**Whitespace**
258
+ |Preserved
259
+ |Removed
260
+ |Collapsed
261
+ |Standardized
262
+
263
+ |**Element order**
264
+ |Preserved
265
+ |May change (XML)
266
+ |Preserved
267
+ |Preserved
268
+
269
+ |**Attribute/key order**
270
+ |Preserved
271
+ |Sorted
272
+ |Preserved
273
+ |Sorted
274
+
275
+ |**Formatting**
276
+ |Preserved
277
+ |Compact
278
+ |Minimal
279
+ |Pretty-printed
280
+
281
+ |**Use case**
282
+ |Exact match
283
+ |Maximum normalization
284
+ |Flexible whitespace
285
+ |Consistent formatting
286
+ |===
287
+
288
+ == Choosing preprocessing options
289
+
290
+ === Decision guide
291
+
292
+ **Use `none` when**:
293
+
294
+ * Testing exact generator output
295
+ * Formatting matters
296
+ * Maximum strictness needed
297
+ * Documents should be identical
298
+
299
+ **Use `c14n` when**:
300
+
301
+ * Comparing from different sources
302
+ * Formatting doesn't matter at all
303
+ * Maximum normalization needed
304
+ * Focus purely on semantic content
305
+
306
+ **Use `normalize` when**:
307
+
308
+ * Whitespace differences should be ignored
309
+ * Want to preserve element/key order
310
+ * Middle ground between strict and normalized
311
+ * Comparing hand-written vs generated content
312
+
313
+ **Use `format` when**:
314
+
315
+ * Both documents should be well-formatted
316
+ * Want readable diff output
317
+ * Standardizing before comparison
318
+ * Pretty-printing for review
319
+
320
+ === Common scenarios
321
+
322
+ .Configuration file comparison
323
+ [example]
324
+ ====
325
+ **Scenario**: Compare JSON config files from different environments.
326
+
327
+ **Recommendation**: `c14n`
328
+
329
+ **Reason**: Key order and whitespace don't matter; focus on values.
330
+
331
+ [source,ruby]
332
+ ----
333
+ Canon::Comparison.equivalent?(prod_config, dev_config,
334
+ preprocessing: :c14n
335
+ )
336
+ ----
337
+ ====
338
+
339
+ .HTML output testing
340
+ [example]
341
+ ====
342
+ **Scenario**: Test HTML generator output against expected fixture.
343
+
344
+ **Recommendation**: `normalize`
345
+
346
+ **Reason**: Ignore whitespace differences but preserve structure.
347
+
348
+ [source,ruby]
349
+ ----
350
+ Canon::Comparison.equivalent?(generated_html, expected_html,
351
+ preprocessing: :normalize
352
+ )
353
+ ----
354
+ ====
355
+
356
+ .XML document comparison
357
+ [example]
358
+ ====
359
+ **Scenario**: Compare XML from manual edits vs programmatic generation.
360
+
361
+ **Recommendation**: `format`
362
+
363
+ **Reason**: Standardize formatting for readable diff.
364
+
365
+ [source,ruby]
366
+ ----
367
+ Canon::Comparison.equivalent?(manual_xml, generated_xml,
368
+ preprocessing: :format,
369
+ verbose: true
370
+ )
371
+ ----
372
+ ====
373
+
374
+ .Exact output verification
375
+ [example]
376
+ ====
377
+ **Scenario**: Verify serializer produces exactly correct output.
378
+
379
+ **Recommendation**: `none`
380
+
381
+ **Reason**: Formatting and whitespace matter.
382
+
383
+ [source,ruby]
384
+ ----
385
+ Canon::Comparison.equivalent?(serialized, expected,
386
+ preprocessing: :none # or omit
387
+ )
388
+ ----
389
+ ====
390
+
391
+ == Combining with match options
392
+
393
+ Preprocessing works in combination with match options (Phase 2):
394
+
395
+ .Preprocessing + match options
396
+ [example]
397
+ ====
398
+ [source,ruby]
399
+ ----
400
+ # Normalize whitespace in preprocessing,
401
+ # then ignore comments in matching
402
+ Canon::Comparison.equivalent?(doc1, doc2,
403
+ preprocessing: :normalize,
404
+ match: {
405
+ comments: :ignore
406
+ }
407
+ )
408
+
409
+ # Format for consistency,
410
+ # then strict matching on content
411
+ Canon::Comparison.equivalent?(doc1, doc2,
412
+ preprocessing: :format,
413
+ match: {
414
+ text_content: :strict,
415
+ structural_whitespace: :strict
416
+ }
417
+ )
418
+ ----
419
+
420
+ Preprocessing transforms, then matching compares.
421
+ ====
422
+
423
+ == Usage across interfaces
424
+
425
+ === Ruby API
426
+
427
+ [source,ruby]
428
+ ----
429
+ # Basic usage
430
+ Canon::Comparison.equivalent?(doc1, doc2,
431
+ preprocessing: :c14n
432
+ )
433
+
434
+ # With match options
435
+ Canon::Comparison.equivalent?(doc1, doc2,
436
+ preprocessing: :normalize,
437
+ match: { comments: :ignore }
438
+ )
439
+
440
+ # With diff options
441
+ Canon::Comparison.equivalent?(doc1, doc2,
442
+ preprocessing: :format,
443
+ verbose: true,
444
+ diff: { mode: :by_line }
445
+ )
446
+ ----
447
+
448
+ === CLI
449
+
450
+ [source,bash]
451
+ ----
452
+ # Basic usage
453
+ $ canon diff file1.xml file2.xml --preprocessing c14n --verbose
454
+
455
+ # With match options
456
+ $ canon diff file1.xml file2.xml \
457
+ --preprocessing normalize \
458
+ --comments ignore \
459
+ --verbose
460
+
461
+ # Format before comparison
462
+ $ canon diff file1.json file2.json \
463
+ --preprocessing format \
464
+ --verbose
465
+ ----
466
+
467
+ === RSpec
468
+
469
+ Preprocessing is not typically configured globally for RSpec since it's
470
+ usually task-specific. Use match options for global configuration instead.
471
+
472
+ [source,ruby]
473
+ ----
474
+ # Per-test preprocessing
475
+ it 'compares with c14n preprocessing' do
476
+ # Note: RSpec matchers don't directly support a preprocessing parameter.
477
+ # Use Canon::Comparison.equivalent? directly instead
478
+ result = Canon::Comparison.equivalent?(actual, expected,
479
+ preprocessing: :c14n
480
+ )
481
+ expect(result).to be true
482
+ end
483
+ ----
484
+
485
+ == See also
486
+
487
+ * link:MATCH_ARCHITECTURE[Match architecture]
488
+ * link:MATCH_OPTIONS[Match options]
489
+ * link:FORMATS[Format support]
490
+ * link:RUBY_API[Ruby API documentation]
491
+ * link:CLI[Command-line interface]