canon 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -1
  3. data/.rubocop_todo.yml +276 -7
  4. data/README.adoc +203 -138
  5. data/_config.yml +116 -0
  6. data/docs/ADVANCED_TOPICS.adoc +20 -0
  7. data/docs/BASIC_USAGE.adoc +16 -0
  8. data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
  9. data/docs/CLI.adoc +493 -0
  10. data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  11. data/docs/DIFF_ARCHITECTURE.adoc +435 -0
  12. data/docs/DIFF_FORMATTING.adoc +540 -0
  13. data/docs/FORMATS.adoc +447 -0
  14. data/docs/INDEX.adoc +222 -0
  15. data/docs/INPUT_VALIDATION.adoc +477 -0
  16. data/docs/MATCH_ARCHITECTURE.adoc +463 -0
  17. data/docs/MATCH_OPTIONS.adoc +719 -0
  18. data/docs/MODES.adoc +432 -0
  19. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  20. data/docs/OPTIONS.adoc +1387 -0
  21. data/docs/PREPROCESSING.adoc +491 -0
  22. data/docs/RSPEC.adoc +605 -0
  23. data/docs/RUBY_API.adoc +478 -0
  24. data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
  25. data/docs/UNDERSTANDING_CANON.adoc +17 -0
  26. data/docs/VERBOSE.adoc +482 -0
  27. data/exe/canon +7 -0
  28. data/lib/canon/cli.rb +179 -0
  29. data/lib/canon/commands/diff_command.rb +195 -0
  30. data/lib/canon/commands/format_command.rb +113 -0
  31. data/lib/canon/comparison/base_comparator.rb +39 -0
  32. data/lib/canon/comparison/comparison_result.rb +79 -0
  33. data/lib/canon/comparison/html_comparator.rb +410 -0
  34. data/lib/canon/comparison/json_comparator.rb +212 -0
  35. data/lib/canon/comparison/match_options.rb +616 -0
  36. data/lib/canon/comparison/xml_comparator.rb +566 -0
  37. data/lib/canon/comparison/yaml_comparator.rb +93 -0
  38. data/lib/canon/comparison.rb +239 -0
  39. data/lib/canon/config.rb +172 -0
  40. data/lib/canon/diff/diff_block.rb +71 -0
  41. data/lib/canon/diff/diff_block_builder.rb +105 -0
  42. data/lib/canon/diff/diff_classifier.rb +46 -0
  43. data/lib/canon/diff/diff_context.rb +85 -0
  44. data/lib/canon/diff/diff_context_builder.rb +107 -0
  45. data/lib/canon/diff/diff_line.rb +77 -0
  46. data/lib/canon/diff/diff_node.rb +56 -0
  47. data/lib/canon/diff/diff_node_mapper.rb +148 -0
  48. data/lib/canon/diff/diff_report.rb +133 -0
  49. data/lib/canon/diff/diff_report_builder.rb +62 -0
  50. data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
  51. data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
  52. data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
  53. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
  54. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
  55. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
  56. data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
  57. data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
  58. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
  59. data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
  60. data/lib/canon/diff_formatter/character_map.yml +197 -0
  61. data/lib/canon/diff_formatter/debug_output.rb +431 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
  63. data/lib/canon/diff_formatter/legend.rb +141 -0
  64. data/lib/canon/diff_formatter.rb +520 -0
  65. data/lib/canon/errors.rb +56 -0
  66. data/lib/canon/formatters/html4_formatter.rb +17 -0
  67. data/lib/canon/formatters/html5_formatter.rb +17 -0
  68. data/lib/canon/formatters/html_formatter.rb +37 -0
  69. data/lib/canon/formatters/html_formatter_base.rb +163 -0
  70. data/lib/canon/formatters/json_formatter.rb +3 -0
  71. data/lib/canon/formatters/xml_formatter.rb +20 -55
  72. data/lib/canon/formatters/yaml_formatter.rb +4 -1
  73. data/lib/canon/pretty_printer/html.rb +57 -0
  74. data/lib/canon/pretty_printer/json.rb +25 -0
  75. data/lib/canon/pretty_printer/xml.rb +29 -0
  76. data/lib/canon/rspec_matchers.rb +222 -80
  77. data/lib/canon/validators/base_validator.rb +49 -0
  78. data/lib/canon/validators/html_validator.rb +138 -0
  79. data/lib/canon/validators/json_validator.rb +89 -0
  80. data/lib/canon/validators/xml_validator.rb +53 -0
  81. data/lib/canon/validators/yaml_validator.rb +73 -0
  82. data/lib/canon/version.rb +1 -1
  83. data/lib/canon/xml/attribute_handler.rb +80 -0
  84. data/lib/canon/xml/c14n.rb +36 -0
  85. data/lib/canon/xml/character_encoder.rb +38 -0
  86. data/lib/canon/xml/data_model.rb +225 -0
  87. data/lib/canon/xml/element_matcher.rb +196 -0
  88. data/lib/canon/xml/line_range_mapper.rb +158 -0
  89. data/lib/canon/xml/namespace_handler.rb +86 -0
  90. data/lib/canon/xml/node.rb +32 -0
  91. data/lib/canon/xml/nodes/attribute_node.rb +54 -0
  92. data/lib/canon/xml/nodes/comment_node.rb +23 -0
  93. data/lib/canon/xml/nodes/element_node.rb +56 -0
  94. data/lib/canon/xml/nodes/namespace_node.rb +38 -0
  95. data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
  96. data/lib/canon/xml/nodes/root_node.rb +16 -0
  97. data/lib/canon/xml/nodes/text_node.rb +23 -0
  98. data/lib/canon/xml/processor.rb +151 -0
  99. data/lib/canon/xml/whitespace_normalizer.rb +72 -0
  100. data/lib/canon/xml/xml_base_handler.rb +188 -0
  101. data/lib/canon.rb +14 -3
  102. metadata +116 -21
@@ -0,0 +1,477 @@
1
+ ---
2
+ layout: default
3
+ title: Input Validation
4
+ nav_order: 33
5
+ parent: Customizing Behavior
6
+ ---
7
+ = Canon input validation
8
+ :toc:
9
+ :toclevels: 3
10
+
11
+ == Scope
12
+
13
+ This document describes Canon's input validation system for XML, HTML, JSON,
14
+ and YAML formats. Validation ensures malformed input is detected early with
15
+ clear error messages.
16
+
17
+ For format support details, see link:FORMATS[Format support].
18
+
19
+ == General
20
+
21
+ Canon validates input before processing and raises `Canon::ValidationError`
22
+ for malformed input. Validation provides:
23
+
24
+ * **Early error detection**: Problems caught before processing begins
25
+ * **Precise error location**: Line and column numbers pinpoint the problem
26
+ * **Clear error messages**: Descriptive messages explain what's wrong
27
+ * **Test-friendly**: Errors appear in RSpec output, not hidden in log files
28
+
29
+ == How validation works
30
+
31
+ Canon validates input **before processing** by running it through a format-specific validator:
32
+
33
+ * `Canon::Validators::XmlValidator` - Strict XML syntax validation
34
+ * `Canon::Validators::HtmlValidator` - HTML5 and XHTML validation
35
+ * `Canon::Validators::JsonValidator` - JSON syntax validation
36
+ * `Canon::Validators::YamlValidator` - YAML syntax validation
37
+
38
+ Validation happens automatically when you use Canon's formatters or
39
+ comparison methods.
40
+
41
+ == Validation error format
42
+
43
+ When validation fails, Canon raises `Canon::ValidationError` with:
44
+
45
+ `format`:: The format being validated (`:xml`, `:html`, `:json`, `:yaml`)
46
+ `line`:: Line number where the error occurred (if available)
47
+ `column`:: Column number where the error occurred (if available)
48
+ `details`:: Additional context about the error
49
+
50
+ .Validation error structure
51
+ [example]
52
+ ====
53
+ [source,ruby]
54
+ ----
55
+ begin
56
+   Canon.format(malformed_xml, :xml)
56
+ rescue Canon::ValidationError => e
57
+   puts e.message
58
+   # => XML Validation Error: Premature end of data in tag unclosed line 1
59
+   #    Line: 1
60
+   #    Column: 18
61
+
62
+   puts "Format: #{e.format}"  # => :xml
63
+   puts "Line: #{e.line}"      # => 1
64
+   puts "Column: #{e.column}"  # => 18
66
+ end
67
+ ----
68
+ ====
69
+
70
+ == Format-specific validation
71
+
72
+ === XML validation
73
+
74
+ Uses Nokogiri's strict XML parsing to detect:
75
+
76
+ * Unclosed tags
77
+ * Mismatched tags
78
+ * Invalid XML declaration
79
+ * Malformed attributes
80
+ * Invalid character references
81
+
82
+ .XML validation examples
83
+ [example]
84
+ ====
85
+ **Unclosed tag**:
86
+
87
+ [source,ruby]
88
+ ----
89
+ Canon.format('<root><item>', :xml)
90
+ # => Canon::ValidationError: XML Validation Error: Premature end of data in tag item line 1
91
+ # Line: 1
92
+ ----
93
+
94
+ **Mismatched tags**:
95
+
96
+ [source,ruby]
97
+ ----
98
+ Canon.format('<root><item></root>', :xml)
99
+ # => Canon::ValidationError: XML Validation Error: Opening and ending tag mismatch: item line 1 and root
100
+ # Line: 1
101
+ ----
102
+
103
+ **Invalid character reference**:
104
+
105
+ [source,ruby]
106
+ ----
107
+ Canon.format('<root>&#xGGGG;</root>', :xml)
108
+ # => Canon::ValidationError: XML Validation Error: xmlParseCharRef: invalid hexadecimal value
109
+ # Line: 1
110
+ ----
111
+ ====
112
+
113
+ === HTML validation
114
+
115
+ Automatically detects HTML5 vs XHTML and applies appropriate validation:
116
+
117
+ * **HTML5**: Uses Nokogiri::HTML5 parser with error filtering
118
+ * **XHTML**: Uses strict XML parsing
119
+
120
+ **Special handling**:
121
+
122
+ * Strips XML declarations from HTML (common in legacy HTML files)
123
+ * Filters out non-critical HTML5 parser warnings
124
+ * Only reports significant errors (level 2+)
125
+
126
+ .HTML validation examples
127
+ [example]
128
+ ====
129
+ **Malformed XHTML**:
130
+
131
+ [source,ruby]
132
+ ----
133
+ xhtml = '<html xmlns="http://www.w3.org/1999/xhtml"><body><p>Unclosed'
134
+ Canon.format(xhtml, :html)
135
+ # => Canon::ValidationError: HTML Validation Error: Premature end of data in tag p line 1
136
+ # Line: 1
137
+ ----
138
+
139
+ **HTML5 with errors**:
140
+
141
+ [source,ruby]
142
+ ----
143
+ html5 = '<div><span></div>'
144
+ Canon.format(html5, :html)
145
+ # => Canon::ValidationError: HTML Validation Error: Unexpected end tag : span
146
+ # Line: 1
147
+ ----
148
+
149
+ **Valid HTML** (no error):
150
+
151
+ [source,ruby]
152
+ ----
153
+ html = '<!DOCTYPE html><html><body><p>Content</p></body></html>'
154
+ Canon.format(html, :html)
155
+ # => Successfully formatted
156
+ ----
157
+ ====
158
+
159
+ === JSON validation
160
+
161
+ Validates JSON syntax using Ruby's JSON parser:
162
+
163
+ * Missing/extra braces or brackets
164
+ * Trailing commas
165
+ * Invalid escape sequences
166
+ * Invalid numbers
167
+
168
+ Provides context showing the error location in the JSON structure.
169
+
170
+ .JSON validation examples
171
+ [example]
172
+ ====
173
+ **Missing closing brace**:
174
+
175
+ [source,ruby]
176
+ ----
177
+ Canon.format('{"key": "value"', :json)
178
+ # => Canon::ValidationError: JSON Validation Error: unexpected token at '{"key": "value"'
179
+ # Details: Error at position 16
180
+ ----
181
+
182
+ **Trailing comma** (invalid in JSON):
183
+
184
+ [source,ruby]
185
+ ----
186
+ Canon.format('{"a": 1,}', :json)
187
+ # => Canon::ValidationError: JSON Validation Error: unexpected token at '{"a": 1,}'
188
+ # Details: Error at position 8
189
+ ----
190
+
191
+ **Invalid number**:
192
+
193
+ [source,ruby]
194
+ ----
195
+ Canon.format('{"value": 01}', :json)
196
+ # => Canon::ValidationError: JSON Validation Error: unexpected token
197
+ ----
198
+
199
+ **Valid JSON** (no error):
200
+
201
+ [source,ruby]
202
+ ----
203
+ Canon.format('{"key": "value"}', :json)
204
+ # => Successfully formatted
205
+ ----
206
+ ====
207
+
208
+ === YAML validation
209
+
210
+ Validates YAML syntax using Psych (Ruby's YAML parser):
211
+
212
+ * Invalid indentation
213
+ * Unclosed brackets/braces
214
+ * Invalid anchors/aliases
215
+ * Type mismatches
216
+
217
+ Shows error location with line numbers and context.
218
+
219
+ .YAML validation examples
220
+ [example]
221
+ ====
222
+ **Unclosed brace**:
223
+
224
+ [source,ruby]
225
+ ----
226
+ Canon.format("key: {unclosed", :yaml)
227
+ # => Canon::ValidationError: YAML Validation Error: (<unknown>): did not find expected node content...
228
+ # Line: 1
229
+ # Details: Shows context around error
230
+ ----
231
+
232
+ **Invalid indentation**:
233
+
234
+ [source,ruby]
235
+ ----
236
+ yaml = <<~YAML
237
+   parent: value
238
+     child: value
239
+ YAML
240
+ Canon.format(yaml, :yaml)
241
+ # => Canon::ValidationError: YAML Validation Error: mapping values are not allowed in this context
242
+ # Line: 2
243
+ ----
244
+
245
+ **Valid YAML** (no error):
246
+
247
+ [source,ruby]
248
+ ----
249
+ yaml = "key: value\nlist:\n - item1\n - item2"
250
+ Canon.format(yaml, :yaml)
251
+ # => Successfully formatted
252
+ ----
253
+ ====
254
+
255
+ == Validation in RSpec tests
256
+
257
+ Canon's RSpec matchers automatically propagate validation errors to test
258
+ output, making it easy to see what's wrong.
259
+
260
+ .RSpec validation error example
261
+ [example]
262
+ ====
263
+ [source,ruby]
264
+ ----
265
+ require 'canon/rspec_matchers'
266
+
267
+ RSpec.describe 'XML validation' do
268
+   it 'validates input' do
268
+     malformed_xml = '<root><unclosed>'
269
+     expected_xml = '<root><item/></root>'
270
+
271
+     # This will fail with a clear validation error message
272
+     expect(malformed_xml).to be_xml_equivalent_to(expected_xml)
273
+   end
275
+ end
276
+
277
+ # Test output shows:
278
+ # Canon::ValidationError:
279
+ # XML Validation Error: Premature end of data in tag unclosed line 1
280
+ # Line: 1
281
+ # Column: 18
282
+ ----
283
+
284
+ The error appears directly in the RSpec output, not hidden in separate error
285
+ files or logs.
286
+ ====
287
+
288
+ == Validation in comparison
289
+
290
+ Validation also occurs when using `Canon::Comparison.equivalent?`:
291
+
292
+ .Comparison validation example
293
+ [example]
294
+ ====
295
+ [source,ruby]
296
+ ----
297
+ require 'canon/comparison'
298
+
299
+ xml1 = '<root><item/></root>'
300
+ xml2 = '<root><unclosed>'
301
+
302
+ Canon::Comparison.equivalent?(xml1, xml2)
303
+ # => Canon::ValidationError: XML Validation Error: Premature end of data in tag unclosed line 1
304
+ # Line: 1
305
+ # Column: 18
306
+ ----
307
+ ====
308
+
309
+ == Error handling strategies
310
+
311
+ === Basic error handling
312
+
313
+ [source,ruby]
314
+ ----
315
+ begin
316
+   Canon.format(input, :xml)
316
+ rescue Canon::ValidationError => e
317
+   puts "Validation failed: #{e.message}"
318
+   puts "Line #{e.line}, Column #{e.column}" if e.line
320
+ end
321
+ ----
322
+
323
+ === Format-specific handling
324
+
325
+ [source,ruby]
326
+ ----
327
+ begin
328
+   Canon.format(input, format)
328
+ rescue Canon::ValidationError => e
329
+   case e.format
330
+   when :xml
331
+     handle_xml_error(e)
332
+   when :html
333
+     handle_html_error(e)
334
+   when :json
335
+     handle_json_error(e)
336
+   when :yaml
337
+     handle_yaml_error(e)
338
+   end
340
+ end
341
+ ----
342
+
343
+ === Validation before processing
344
+
345
+ [source,ruby]
346
+ ----
347
+ def process_xml(xml_string)
348
+   # Validate early
348
+   begin
349
+     Canon.format(xml_string, :xml)
350
+   rescue Canon::ValidationError => e
351
+     log_validation_error(e)
352
+     return { error: e.message, line: e.line }
353
+   end
354
+
355
+   # Proceed with processing
356
+   process_valid_xml(xml_string)
358
+ end
359
+ ----
360
+
361
+ == Common validation errors
362
+
363
+ === XML common errors
364
+
365
+ [cols="1,2,1"]
366
+ |===
367
+ |Error |Cause |Solution
368
+
369
+ |Premature end of data
370
+ |Unclosed tag
371
+ |Close all tags
372
+
373
+ |Tag mismatch
374
+ |Opening/closing tags don't match
375
+ |Match tag names exactly
376
+
377
+ |Invalid character reference
378
+ |Bad entity or character code
379
+ |Use valid entities
380
+
381
+ |Invalid XML declaration
382
+ |Malformed `<?xml...?>` declaration
383
+ |Fix or remove declaration
384
+ |===
385
+
386
+ === HTML common errors
387
+
388
+ [cols="1,2,1"]
389
+ |===
390
+ |Error |Cause |Solution
391
+
392
+ |Unexpected end tag
393
+ |Mismatched or extra closing tag
394
+ |Match opening/closing tags
395
+
396
+ |Invalid DOCTYPE
397
+ |Malformed document type declaration
398
+ |Use standard DOCTYPE
399
+
400
+ |Unclosed tag
401
+ |Missing closing tag
402
+ |Close all tags properly
403
+
404
+ |Invalid attribute
405
+ |Malformed attribute syntax
406
+ |Fix attribute syntax
407
+ |===
408
+
409
+ === JSON common errors
410
+
411
+ [cols="1,2,1"]
412
+ |===
413
+ |Error |Cause |Solution
414
+
415
+ |Unexpected token
416
+ |Syntax error in JSON
417
+ |Check JSON syntax
418
+
419
+ |Trailing comma
420
+ |Comma after last element
421
+ |Remove trailing commas
422
+
423
+ |Unclosed bracket/brace
424
+ |Missing `]` or `}`
425
+ |Close all brackets/braces
426
+
427
+ |Invalid number
428
+ |Leading zeros or invalid format
429
+ |Use valid number format
430
+ |===
431
+
432
+ === YAML common errors
433
+
434
+ [cols="1,2,1"]
435
+ |===
436
+ |Error |Cause |Solution
437
+
438
+ |Invalid indentation
439
+ |Inconsistent indentation
440
+ |Use consistent spaces
441
+
442
+ |Unclosed bracket
443
+ |Missing closing bracket
444
+ |Close all brackets
445
+
446
+ |Invalid anchor
447
+ |Malformed anchor/alias
448
+ |Fix anchor syntax
449
+
450
+ |Type mismatch
451
+ |Value doesn't match expected type
452
+ |Fix value or type
453
+ |===
454
+
455
+ == Benefits
456
+
457
+ **Early error detection**:: Problems caught before processing begins, saving
458
+ time and providing clear feedback
459
+
460
+ **Precise error location**:: Line and column numbers pinpoint exactly where
461
+ the problem is, especially useful in large documents
462
+
463
+ **Clear error messages**:: Descriptive messages explain what's wrong and
464
+ often suggest how to fix it
465
+
466
+ **Test-friendly**:: Errors appear in RSpec output where developers expect
467
+ them, not in separate log files
468
+
469
+ **Format-aware**:: Each validator understands format-specific rules and
470
+ provides relevant error details
471
+
472
+ == See also
473
+
474
+ * link:FORMATS[Format support]
475
+ * link:RUBY_API[Ruby API documentation]
476
+ * link:RSPEC[RSpec matchers]
477
+ * link:CLI[Command-line interface]