canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +83 -22
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +196 -24
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/markup_comparator.rb +109 -2
  11. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  12. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  13. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  14. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
  15. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  16. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  17. data/lib/canon/comparison/xml_comparator.rb +240 -23
  18. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  19. data/lib/canon/diff/diff_classifier.rb +119 -5
  20. data/lib/canon/diff/formatting_detector.rb +1 -1
  21. data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
  22. data/lib/canon/rspec_matchers.rb +37 -8
  23. data/lib/canon/version.rb +1 -1
  24. data/lib/canon/xml/data_model.rb +24 -13
  25. metadata +4 -78
  26. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  27. data/false_positive_analysis.txt +0 -0
  28. data/file1.html +0 -1
  29. data/file2.html +0 -1
  30. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  31. data/old-docs/BASIC_USAGE.adoc +0 -16
  32. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  33. data/old-docs/CLI.adoc +0 -497
  34. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  35. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  36. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  37. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  38. data/old-docs/DOM_DIFF.adoc +0 -1017
  39. data/old-docs/ENV_CONFIG.adoc +0 -876
  40. data/old-docs/FORMATS.adoc +0 -867
  41. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  42. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  43. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  44. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  45. data/old-docs/MODES.adoc +0 -432
  46. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  47. data/old-docs/OPTIONS.adoc +0 -1387
  48. data/old-docs/PREPROCESSING.adoc +0 -491
  49. data/old-docs/README.old.adoc +0 -2831
  50. data/old-docs/RSPEC.adoc +0 -814
  51. data/old-docs/RUBY_API.adoc +0 -485
  52. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  53. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  54. data/old-docs/STRING_COMPARE.adoc +0 -345
  55. data/old-docs/TMP.adoc +0 -3384
  56. data/old-docs/TREE_DIFF.adoc +0 -1080
  57. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  58. data/old-docs/VERBOSE.adoc +0 -482
  59. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  60. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  61. data/scripts/analyze_current_state.rb +0 -85
  62. data/scripts/analyze_false_positives.rb +0 -114
  63. data/scripts/analyze_remaining_failures.rb +0 -105
  64. data/scripts/compare_current_failures.rb +0 -95
  65. data/scripts/compare_dom_tree_diff.rb +0 -158
  66. data/scripts/compare_failures.rb +0 -151
  67. data/scripts/debug_attribute_extraction.rb +0 -66
  68. data/scripts/debug_blocks_839.rb +0 -115
  69. data/scripts/debug_meta_matching.rb +0 -52
  70. data/scripts/debug_p_matching.rb +0 -192
  71. data/scripts/debug_signature_matching.rb +0 -118
  72. data/scripts/debug_sourcecode_124.rb +0 -32
  73. data/scripts/debug_whitespace_sensitive.rb +0 -192
  74. data/scripts/extract_false_positives.rb +0 -138
  75. data/scripts/find_actual_false_positives.rb +0 -125
  76. data/scripts/investigate_all_false_positives.rb +0 -161
  77. data/scripts/investigate_batch1.rb +0 -127
  78. data/scripts/investigate_classification.rb +0 -150
  79. data/scripts/investigate_classification_detailed.rb +0 -190
  80. data/scripts/investigate_common_failures.rb +0 -342
  81. data/scripts/investigate_false_negative.rb +0 -80
  82. data/scripts/investigate_false_positive.rb +0 -83
  83. data/scripts/investigate_false_positives.rb +0 -227
  84. data/scripts/investigate_false_positives_batch.rb +0 -163
  85. data/scripts/investigate_mixed_content.rb +0 -125
  86. data/scripts/investigate_remaining_16.rb +0 -214
  87. data/scripts/run_single_test.rb +0 -29
  88. data/scripts/test_all_false_positives.rb +0 -95
  89. data/scripts/test_attribute_details.rb +0 -61
  90. data/scripts/test_both_algorithms.rb +0 -49
  91. data/scripts/test_both_simple.rb +0 -49
  92. data/scripts/test_enhanced_semantic_output.rb +0 -125
  93. data/scripts/test_readme_examples.rb +0 -131
  94. data/scripts/test_semantic_tree_diff.rb +0 -99
  95. data/scripts/test_semantic_ux_improvements.rb +0 -135
  96. data/scripts/test_single_false_positive.rb +0 -119
  97. data/scripts/test_size_limits.rb +0 -99
  98. data/test_html_1.html +0 -21
  99. data/test_html_2.html +0 -21
  100. data/test_nokogiri.rb +0 -33
  101. data/test_normalize.rb +0 -45
@@ -1,485 +0,0 @@
1
- ---
2
- layout: default
3
- title: Ruby API
4
- nav_order: 10
5
- parent: Basic Usage
6
- grand_parent: Documentation Index
7
- ---
8
- = Canon Ruby API
9
- :toc:
10
- :toclevels: 3
11
-
12
- == Scope
13
-
14
- This document describes how to use Canon from Ruby code. It covers formatting,
15
- parsing, comparison, and input validation APIs.
16
-
17
- For command-line usage, see link:CLI[CLI documentation].
18
-
19
- For RSpec testing, see link:RSPEC[RSpec documentation].
20
-
21
- == General
22
-
23
- Canon provides a unified Ruby API for working with XML, HTML, JSON, and YAML
24
- formats. All methods follow consistent patterns across formats.
25
-
26
- == Formatting
27
-
28
- === Canonical formatting
29
-
30
- The `Canon.format` method produces canonical output (compact, normalized).
31
-
32
- Syntax:
33
-
34
- [source,ruby]
35
- ----
36
- Canon.format(content, format)
37
- Canon.format_{format}(content) # Format-specific shorthand
38
- ----
39
-
40
- Where:
41
-
42
- `content`:: The input string
43
- `format`:: The format type (`:xml`, `:html`, `:json`, or `:yaml`)
44
-
45
- .Canonical formatting examples
46
- [example]
47
- ====
48
- [source,ruby]
49
- ----
50
- require 'canon'
51
-
52
- # XML - compact canonical form
53
- xml = '<root><b>2</b><a>1</a></root>'
54
- Canon.format(xml, :xml)
55
- # => "<root><a>1</a><b>2</b></root>"
56
-
57
- Canon.format_xml(xml) # Shorthand
58
- # => "<root><a>1</a><b>2</b></root>"
59
-
60
- # HTML - compact canonical form
61
- html = '<div><p>Hello</p></div>'
62
- Canon.format(html, :html)
63
- Canon.format_html(html) # Shorthand
64
-
65
- # JSON - canonical with sorted keys
66
- json = '{"z":3,"a":1,"b":2}'
67
- Canon.format(json, :json)
68
- # => '{"a":1,"b":2,"z":3}'
69
-
70
- # YAML - canonical with sorted keys
71
- yaml = "z: 3\na: 1\nb: 2"
72
- Canon.format(yaml, :yaml)
73
- ----
74
- ====
75
-
76
- === Pretty-print formatting
77
-
78
- For human-readable output with indentation, use format-specific pretty printer
79
- classes.
80
-
81
- Syntax:
82
-
83
- [source,ruby]
84
- ----
85
- Canon::{Format}::PrettyPrinter.new(indent: n, indent_type: type).format(content)
86
- ----
87
-
88
- Where:
89
-
90
- `{Format}`:: The format module (`Xml`, `Html`, `Json`)
91
- `n`:: Indentation width per level: number of spaces (default: 2), or number of tabs (use 1) when `indent_type` is `'tab'`
92
- `type`:: Indentation type: `'space'` (default) or `'tab'`
93
- `content`:: The input string
94
-
95
- .Pretty-print examples
96
- [example]
97
- ====
98
- [source,ruby]
99
- ----
100
- require 'canon/pretty_printer/xml'
101
- require 'canon/pretty_printer/html'
102
- require 'canon/pretty_printer/json'
103
-
104
- xml_input = '<root><b>2</b><a>1</a></root>'
105
-
106
- # XML with 2-space indentation (default)
107
- Canon::Xml::PrettyPrinter.new(indent: 2).format(xml_input)
108
- # =>
109
- # <?xml version="1.0" encoding="UTF-8"?>
110
- # <root>
111
- # <a>1</a>
112
- # <b>2</b>
113
- # </root>
114
-
115
- # XML with 4-space indentation
116
- Canon::Xml::PrettyPrinter.new(indent: 4).format(xml_input)
117
-
118
- # XML with tab indentation
119
- Canon::Xml::PrettyPrinter.new(
120
- indent: 1,
121
- indent_type: 'tab'
122
- ).format(xml_input)
123
-
124
- # HTML with 2-space indentation
125
- html_input = '<div><p>Hello</p></div>'
126
- Canon::Html::PrettyPrinter.new(indent: 2).format(html_input)
127
-
128
- # JSON with 2-space indentation
129
- json_input = '{"z":3,"a":{"b":1}}'
130
- Canon::Json::PrettyPrinter.new(indent: 2).format(json_input)
131
-
132
- # JSON with tab indentation
133
- Canon::Json::PrettyPrinter.new(
134
- indent: 1,
135
- indent_type: 'tab'
136
- ).format(json_input)
137
- ----
138
- ====
139
-
140
- == Parsing
141
-
142
- The `Canon.parse` method parses content into Ruby objects or Nokogiri
143
- documents.
144
-
145
- Syntax:
146
-
147
- [source,ruby]
148
- ----
149
- Canon.parse(content, format)
150
- Canon.parse_{format}(content) # Format-specific shorthand
151
- ----
152
-
153
- Where:
154
-
155
- `content`:: The input string
156
- `format`:: The format type (`:xml`, `:html`, `:json`, or `:yaml`)
157
-
158
- .Parsing examples
159
- [example]
160
- ====
161
- [source,ruby]
162
- ----
163
- # Parse XML → Nokogiri::XML::Document
164
- xml_doc = Canon.parse(xml_input, :xml)
165
- xml_doc = Canon.parse_xml(xml_input)
166
-
167
- # Parse HTML → Nokogiri::HTML5::Document (or Nokogiri::XML::Document for XHTML)
168
- html_doc = Canon.parse(html_input, :html)
169
- html_doc = Canon.parse_html(html_input)
170
-
171
- # Parse JSON → Ruby Hash/Array
172
- json_obj = Canon.parse(json_input, :json)
173
- json_obj = Canon.parse_json(json_input)
174
-
175
- # Parse YAML → Ruby Hash/Array
176
- yaml_obj = Canon.parse(yaml_input, :yaml)
177
- yaml_obj = Canon.parse_yaml(yaml_input)
178
- ----
179
- ====
180
-
181
- == Comparison
182
-
183
- === General
184
-
185
- The `Canon::Comparison.equivalent?` method compares two documents semantically.
186
-
187
- The comparison uses depth-first traversal of DOM trees (XML/HTML) or object
188
- graphs (JSON/YAML), comparing nodes/values based on configurable match
189
- dimensions.
190
-
191
- See link:MATCH_OPTIONS[Match options] for details on match dimensions and
192
- profiles.
193
-
194
- === Basic comparison
195
-
196
- Syntax:
197
-
198
- [source,ruby]
199
- ----
200
- Canon::Comparison.equivalent?(obj1, obj2, options = {})
201
- ----
202
-
203
- Where:
204
-
205
- `obj1`:: First document (String, Nokogiri document, or Ruby object)
206
- `obj2`:: Second document (String, Nokogiri document, or Ruby object)
207
- `options`:: Hash of comparison options (optional)
208
-
209
- Returns:
210
-
211
- * `true` if documents are semantically equivalent
212
- * `false` if documents differ
213
- * `ComparisonResult` object if `verbose: true`
214
-
215
- Options:
216
-
217
- * `diff_algorithm`: `:dom` (default) or `:semantic` - chooses the diff algorithm
218
- * `verbose`: `true` or `false` - returns detailed results when true
219
- * `match`: Hash of match dimension options
220
- * `match_profile`: Symbol specifying a predefined profile
221
-
222
- .Basic comparison examples
223
- [example]
224
- ====
225
- [source,ruby]
226
- ----
227
- require 'canon/comparison'
228
-
229
- # HTML comparison - ignores whitespace by default
230
- html1 = '<div><p>Hello</p></div>'
231
- html2 = '<div> <p> Hello </p> </div>'
232
- Canon::Comparison.equivalent?(html1, html2)
233
- # => true
234
-
235
- # XML comparison - element order doesn't matter for children
236
- xml1 = '<root><a>1</a><b>2</b></root>'
237
- xml2 = '<root> <b>2</b> <a>1</a> </root>'
238
- Canon::Comparison.equivalent?(xml1, xml2)
239
- # => true
240
-
241
- # JSON comparison
242
- json1 = '{"a":1,"b":2}'
243
- json2 = '{"b":2,"a":1}'
244
- Canon::Comparison.equivalent?(json1, json2)
245
- # => true
246
-
247
- # With Nokogiri documents
248
- doc1 = Nokogiri::HTML5(html1)
249
- doc2 = Nokogiri::HTML5(html2)
250
- Canon::Comparison.equivalent?(doc1, doc2)
251
- # => true
252
- ----
253
- ====
254
-
255
- === Comparison with match options
256
-
257
- Match options control which aspects of documents are compared and how strictly.
258
-
259
- Syntax:
260
-
261
- [source,ruby]
262
- ----
263
- Canon::Comparison.equivalent?(obj1, obj2,
264
- match: {
265
- dimension1: behavior1,
266
- dimension2: behavior2,
267
- ...
268
- }
269
- )
270
- ----
271
-
272
- See link:MATCH_OPTIONS[Match options] for complete dimension reference.
273
-
274
- .Match option examples
275
- [example]
276
- ====
277
- [source,ruby]
278
- ----
279
- # Normalize whitespace in text content
280
- Canon::Comparison.equivalent?(xml1, xml2,
281
- match: {
282
- text_content: :normalize,
283
- structural_whitespace: :ignore
284
- }
285
- )
286
-
287
- # Ignore comments
288
- Canon::Comparison.equivalent?(xml1, xml2,
289
- match: {
290
- comments: :ignore
291
- }
292
- )
293
-
294
- # Strict attribute order
295
- Canon::Comparison.equivalent?(xml1, xml2,
296
- match: {
297
- attribute_order: :strict
298
- }
299
- )
300
-
301
- # Multiple dimensions
302
- Canon::Comparison.equivalent?(html1, html2,
303
- match: {
304
- text_content: :normalize,
305
- structural_whitespace: :ignore,
306
- attribute_order: :ignore,
307
- comments: :ignore
308
- }
309
- )
310
- ----
311
- ====
312
-
313
- === Using match profiles
314
-
315
- Match profiles are predefined combinations of match dimension settings.
316
-
317
- Syntax:
318
-
319
- [source,ruby]
320
- ----
321
- Canon::Comparison.equivalent?(obj1, obj2,
322
- match_profile: :profile_name
323
- )
324
- ----
325
-
326
- Available profiles:
327
-
328
- `:strict`:: Exact matching - all dimensions use `:strict` behavior
329
- `:rendered`:: Mimics browser rendering - ignores formatting differences
330
- `:spec_friendly`:: Test-friendly - ignores most formatting, focuses on content
331
- `:content_only`:: Maximum tolerance - only semantic content matters
332
-
333
- .Match profile examples
334
- [example]
335
- ====
336
- [source,ruby]
337
- ----
338
- # Use spec_friendly profile (common for tests)
339
- Canon::Comparison.equivalent?(xml1, xml2,
340
- match_profile: :spec_friendly
341
- )
342
-
343
- # Use rendered profile (for HTML)
344
- Canon::Comparison.equivalent?(html1, html2,
345
- match_profile: :rendered
346
- )
347
-
348
- # Override profile with specific dimension
349
- Canon::Comparison.equivalent?(xml1, xml2,
350
- match_profile: :spec_friendly,
351
- match: {
352
- comments: :strict # Override profile setting
353
- }
354
- )
355
- ----
356
- ====
357
-
358
- === Verbose mode
359
-
360
- When `verbose: true` is specified, the method returns detailed comparison
361
- results instead of a boolean.
362
-
363
- Syntax:
364
-
365
- [source,ruby]
366
- ----
367
- result = Canon::Comparison.equivalent?(obj1, obj2, verbose: true)
368
- ----
369
-
370
- Returns:
371
-
372
- A Hash with two keys:
373
-
374
- `:differences`:: Array of difference objects (empty if equivalent)
375
- `:preprocessed`:: Two-element array of preprocessed documents
376
-
377
- .Verbose mode examples
378
- [example]
379
- ====
380
- [source,ruby]
381
- ----
382
- # Get detailed diff information
383
- result = Canon::Comparison.equivalent?(xml1, xml2, verbose: true)
384
-
385
- if result[:differences].empty?
386
- puts "Documents are equivalent"
387
- else
388
- puts "Found #{result[:differences].size} differences"
389
- result[:differences].each do |diff|
390
- puts "Difference: #{diff}"
391
- end
392
- end
393
-
394
- # Access preprocessed content
395
- preprocessed1, preprocessed2 = result[:preprocessed]
396
-
397
- # Verbose with custom options
398
- result = Canon::Comparison.equivalent?(xml1, xml2,
399
- verbose: true,
400
- match: {
401
- text_content: :normalize,
402
- comments: :ignore
403
- }
404
- )
405
- ----
406
- ====
407
-
408
- === Format-specific comparators
409
-
410
- You can use format-specific comparator classes directly.
411
-
412
- Syntax:
413
-
414
- [source,ruby]
415
- ----
416
- Canon::Comparison::XmlComparator.equivalent?(obj1, obj2, options)
417
- Canon::Comparison::HtmlComparator.equivalent?(obj1, obj2, options)
418
- Canon::Comparison::JsonComparator.equivalent?(obj1, obj2, options)
419
- Canon::Comparison::YamlComparator.equivalent?(obj1, obj2, options)
420
- ----
421
-
422
- .Format-specific comparator examples
423
- [example]
424
- ====
425
- [source,ruby]
426
- ----
427
- # XML comparison with strict attribute order
428
- Canon::Comparison::XmlComparator.equivalent?(xml1, xml2,
429
- match: {
430
- attribute_order: :strict
431
- }
432
- )
433
-
434
- # HTML comparison with rendered profile
435
- Canon::Comparison::HtmlComparator.equivalent?(html1, html2,
436
- match_profile: :rendered
437
- )
438
-
439
- # JSON comparison ignoring key order
440
- Canon::Comparison::JsonComparator.equivalent?(json1, json2,
441
- match: {
442
- key_order: :ignore
443
- }
444
- )
445
- ----
446
- ====
447
-
448
- == Validation
449
-
450
- Canon validates input before processing and raises `Canon::ValidationError`
451
- for malformed input.
452
-
453
- .Validation error handling
454
- [example]
455
- ====
456
- [source,ruby]
457
- ----
458
- require 'canon'
459
-
460
- malformed_xml = '<root><unclosed>'
461
-
462
- begin
463
- Canon.format(malformed_xml, :xml)
464
- rescue Canon::ValidationError => e
465
- puts e.message
466
- # => XML Validation Error: Premature end of data in tag unclosed line 1
467
- # Line: 1
468
- # Column: 18
469
-
470
- puts "Format: #{e.format}" # => :xml
471
- puts "Line: #{e.line}" # => 1
472
- puts "Column: #{e.column}" # => 18
473
- end
474
- ----
475
- ====
476
-
477
- See link:INPUT_VALIDATION[Input validation] for details.
478
-
479
- == See also
480
-
481
- * link:CLI[Command-line interface]
482
- * link:RSPEC[RSpec matchers]
483
- * link:MATCH_OPTIONS[Match options reference]
484
- * link:FORMATS[Format support details]
485
- * link:INPUT_VALIDATION[Input validation]