canon 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -1
  3. data/.rubocop_todo.yml +276 -7
  4. data/README.adoc +203 -138
  5. data/_config.yml +116 -0
  6. data/docs/ADVANCED_TOPICS.adoc +20 -0
  7. data/docs/BASIC_USAGE.adoc +16 -0
  8. data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
  9. data/docs/CLI.adoc +493 -0
  10. data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  11. data/docs/DIFF_ARCHITECTURE.adoc +435 -0
  12. data/docs/DIFF_FORMATTING.adoc +540 -0
  13. data/docs/FORMATS.adoc +447 -0
  14. data/docs/INDEX.adoc +222 -0
  15. data/docs/INPUT_VALIDATION.adoc +477 -0
  16. data/docs/MATCH_ARCHITECTURE.adoc +463 -0
  17. data/docs/MATCH_OPTIONS.adoc +719 -0
  18. data/docs/MODES.adoc +432 -0
  19. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  20. data/docs/OPTIONS.adoc +1387 -0
  21. data/docs/PREPROCESSING.adoc +491 -0
  22. data/docs/RSPEC.adoc +605 -0
  23. data/docs/RUBY_API.adoc +478 -0
  24. data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
  25. data/docs/UNDERSTANDING_CANON.adoc +17 -0
  26. data/docs/VERBOSE.adoc +482 -0
  27. data/exe/canon +7 -0
  28. data/lib/canon/cli.rb +179 -0
  29. data/lib/canon/commands/diff_command.rb +195 -0
  30. data/lib/canon/commands/format_command.rb +113 -0
  31. data/lib/canon/comparison/base_comparator.rb +39 -0
  32. data/lib/canon/comparison/comparison_result.rb +79 -0
  33. data/lib/canon/comparison/html_comparator.rb +410 -0
  34. data/lib/canon/comparison/json_comparator.rb +212 -0
  35. data/lib/canon/comparison/match_options.rb +616 -0
  36. data/lib/canon/comparison/xml_comparator.rb +566 -0
  37. data/lib/canon/comparison/yaml_comparator.rb +93 -0
  38. data/lib/canon/comparison.rb +239 -0
  39. data/lib/canon/config.rb +172 -0
  40. data/lib/canon/diff/diff_block.rb +71 -0
  41. data/lib/canon/diff/diff_block_builder.rb +105 -0
  42. data/lib/canon/diff/diff_classifier.rb +46 -0
  43. data/lib/canon/diff/diff_context.rb +85 -0
  44. data/lib/canon/diff/diff_context_builder.rb +107 -0
  45. data/lib/canon/diff/diff_line.rb +77 -0
  46. data/lib/canon/diff/diff_node.rb +56 -0
  47. data/lib/canon/diff/diff_node_mapper.rb +148 -0
  48. data/lib/canon/diff/diff_report.rb +133 -0
  49. data/lib/canon/diff/diff_report_builder.rb +62 -0
  50. data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
  51. data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
  52. data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
  53. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
  54. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
  55. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
  56. data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
  57. data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
  58. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
  59. data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
  60. data/lib/canon/diff_formatter/character_map.yml +197 -0
  61. data/lib/canon/diff_formatter/debug_output.rb +431 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
  63. data/lib/canon/diff_formatter/legend.rb +141 -0
  64. data/lib/canon/diff_formatter.rb +520 -0
  65. data/lib/canon/errors.rb +56 -0
  66. data/lib/canon/formatters/html4_formatter.rb +17 -0
  67. data/lib/canon/formatters/html5_formatter.rb +17 -0
  68. data/lib/canon/formatters/html_formatter.rb +37 -0
  69. data/lib/canon/formatters/html_formatter_base.rb +163 -0
  70. data/lib/canon/formatters/json_formatter.rb +3 -0
  71. data/lib/canon/formatters/xml_formatter.rb +20 -55
  72. data/lib/canon/formatters/yaml_formatter.rb +4 -1
  73. data/lib/canon/pretty_printer/html.rb +57 -0
  74. data/lib/canon/pretty_printer/json.rb +25 -0
  75. data/lib/canon/pretty_printer/xml.rb +29 -0
  76. data/lib/canon/rspec_matchers.rb +222 -80
  77. data/lib/canon/validators/base_validator.rb +49 -0
  78. data/lib/canon/validators/html_validator.rb +138 -0
  79. data/lib/canon/validators/json_validator.rb +89 -0
  80. data/lib/canon/validators/xml_validator.rb +53 -0
  81. data/lib/canon/validators/yaml_validator.rb +73 -0
  82. data/lib/canon/version.rb +1 -1
  83. data/lib/canon/xml/attribute_handler.rb +80 -0
  84. data/lib/canon/xml/c14n.rb +36 -0
  85. data/lib/canon/xml/character_encoder.rb +38 -0
  86. data/lib/canon/xml/data_model.rb +225 -0
  87. data/lib/canon/xml/element_matcher.rb +196 -0
  88. data/lib/canon/xml/line_range_mapper.rb +158 -0
  89. data/lib/canon/xml/namespace_handler.rb +86 -0
  90. data/lib/canon/xml/node.rb +32 -0
  91. data/lib/canon/xml/nodes/attribute_node.rb +54 -0
  92. data/lib/canon/xml/nodes/comment_node.rb +23 -0
  93. data/lib/canon/xml/nodes/element_node.rb +56 -0
  94. data/lib/canon/xml/nodes/namespace_node.rb +38 -0
  95. data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
  96. data/lib/canon/xml/nodes/root_node.rb +16 -0
  97. data/lib/canon/xml/nodes/text_node.rb +23 -0
  98. data/lib/canon/xml/processor.rb +151 -0
  99. data/lib/canon/xml/whitespace_normalizer.rb +72 -0
  100. data/lib/canon/xml/xml_base_handler.rb +188 -0
  101. data/lib/canon.rb +14 -3
  102. metadata +116 -21
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "canon" unless defined?(::Canon)
4
- require "compare-xml"
5
- require "diffy"
4
+ require "canon/comparison"
5
+ require "canon/diff_formatter"
6
+ require "canon/config"
6
7
 
7
8
  begin
8
9
  require "rspec/expectations"
@@ -11,117 +12,226 @@ end
11
12
 
12
13
  module Canon
13
14
  module RSpecMatchers
15
+ # Configuration for RSpec matchers - delegates to Canon::Config
16
+ class << self
17
+ def configure
18
+ yield Canon::Config.configure
19
+ end
20
+
21
+ def reset_config
22
+ Canon::Config.reset!
23
+ end
24
+
25
+ # Delegate configuration getters to Canon::Config
26
+ def xml
27
+ Canon::Config.instance.xml
28
+ end
29
+
30
+ def html
31
+ Canon::Config.instance.html
32
+ end
33
+
34
+ def json
35
+ Canon::Config.instance.json
36
+ end
37
+
38
+ def yaml
39
+ Canon::Config.instance.yaml
40
+ end
41
+ end
42
+
14
43
  # Base matcher class for serialization equivalence
44
+ # This is a THIN WRAPPER around Canon::Comparison API
15
45
  class SerializationMatcher
16
- def initialize(expected, format = :xml)
46
+ def initialize(expected, format = nil, match_profile: nil,
47
+ match: nil, preprocessing: nil)
17
48
  @expected = expected
18
- unless SUPPORTED_FORMATS.include?(format.to_sym)
19
- raise Canon::Error, "Unsupported format: #{format}"
20
- end
21
-
22
- @format = format.to_sym
23
- @result = nil
49
+ @format = format&.to_sym
50
+ @match_profile = match_profile
51
+ @match = match
52
+ @preprocessing = preprocessing
24
53
  end
25
54
 
26
55
  def matches?(target)
27
56
  @target = target
28
- send("match_#{@format}")
29
- rescue NoMethodError
30
- raise Canon::Error, "Unsupported format: #{@format}"
31
- end
32
-
33
- def match_xml
34
- @result = CompareXML.equivalent?(
35
- Nokogiri::XML(@target),
36
- Nokogiri::XML(@expected),
37
- {
38
- collapse_whitespace: true,
39
- ignore_attr_order: true,
40
- verbose: true,
41
- },
57
+
58
+ # Build comparison options from config and matcher params
59
+ opts = build_comparison_options
60
+
61
+ # Add format hint if explicitly provided
62
+ opts[:format] = @format if @format
63
+
64
+ # Delegate to Canon::Comparison.equivalent? - the SINGLE source of truth
65
+ # Comparison handles format detection, HTML parsing, and all business logic
66
+ @comparison_result = Canon::Comparison.equivalent?(
67
+ @expected,
68
+ @target,
69
+ opts,
42
70
  )
43
71
 
44
- @result.empty?
72
+ # When verbose: true, result is a ComparisonResult object
73
+ # Use the equivalent? method to check for normative differences
74
+ case @comparison_result
75
+ when Canon::Comparison::ComparisonResult
76
+ @comparison_result.equivalent?
77
+ when Hash
78
+ # Legacy format - Hash with :differences array and :preprocessed strings
79
+ @comparison_result[:differences].empty?
80
+ when Array
81
+ # Legacy format - XML/JSON/YAML returns []
82
+ @comparison_result.empty?
83
+ else
84
+ # Boolean result
85
+ @comparison_result
86
+ end
45
87
  end
46
88
 
47
- # Canonicalize and check string equivalence for YAML/JSON
48
- def match_yaml
49
- canonicalize_and_compare(:yaml)
89
+ def failure_message
90
+ "expected #{format_name} to be equivalent\n\n#{diff_output}"
50
91
  end
51
92
 
52
- def match_json
53
- canonicalize_and_compare(:json)
93
+ def failure_message_when_negated
94
+ "expected #{format_name} not to be equivalent"
54
95
  end
55
96
 
56
- private
97
+ def expected
98
+ @expected
99
+ end
57
100
 
58
- def canonicalize_and_compare(format)
59
- @actual_sorted = Canon.format(@target, format)
60
- @expected_sorted = Canon.format(@expected, format)
61
- @actual_sorted == @expected_sorted
101
+ def actual
102
+ @target
62
103
  end
63
104
 
64
- def failure_message
65
- case @format
66
- when :xml
67
- xml_failure_message
68
- when :yaml, :json
69
- generic_failure_message
105
+ def diffable
106
+ false
107
+ end
108
+
109
+ private
110
+
111
+ def format_name
112
+ # Use explicitly provided format if available
113
+ if @format
114
+ case @format
115
+ when :html4, :html5 then "HTML"
116
+ when :string then "STRING"
117
+ else @format.to_s.upcase
118
+ end
119
+ else
120
+ # Fall back to detection only if format not provided
121
+ begin
122
+ detected_format = Canon::Comparison.send(:detect_format, @expected)
123
+ detected_format.to_s.upcase
124
+ rescue StandardError
125
+ "CONTENT"
126
+ end
70
127
  end
71
128
  end
72
129
 
73
- def xml_failure_message
74
- index = 0
75
- @result.map do |hash|
76
- index += 1
77
- index_str = index.to_s.rjust(2, "0")
78
- "DIFF #{index_str}:\n" \
79
- " expected node: #{hash[:node1]}\n" \
80
- " actual node : #{hash[:node2]}\n" \
81
- " diff from : #{hash[:diff1]}\n" \
82
- " diff to : #{hash[:diff2]}\n"
83
- end.join("\n")
84
- end
85
-
86
- def generic_failure_message
87
- diff = Diffy::Diff.new(
88
- @expected_sorted,
89
- @actual_sorted,
90
- include_diff_info: false,
91
- include_plus_and_minus_in_html: true,
92
- diff_options: "-u",
93
- )
130
+ def build_comparison_options
131
+ opts = { verbose: true } # Always use verbose for diff generation
132
+
133
+ # Add per-test parameters (highest priority)
134
+ opts[:match_profile] = @match_profile if @match_profile
135
+ opts[:match] = @match if @match
136
+ opts[:preprocessing] = @preprocessing if @preprocessing
137
+
138
+ # Add global configuration from Canon::Config (lower priority)
139
+ if @format
140
+ config_format = normalize_format_for_config(@format)
141
+
142
+ # Only access config if format is supported
143
+ if Canon::Config.instance.respond_to?(config_format)
144
+ format_config = Canon::Config.instance.public_send(config_format)
145
+ if format_config.match.profile
146
+ opts[:global_profile] =
147
+ format_config.match.profile
148
+ end
149
+ unless format_config.match.options.empty?
150
+ opts[:global_options] =
151
+ format_config.match.options
152
+ end
153
+ opts[:preprocessing] ||= format_config.preprocessing
154
+ elsif !%i[xml html html4 html5 json yaml
155
+ string].include?(@format)
156
+ # Unsupported format - raise error early
157
+ raise Canon::Error, "Unsupported format: #{@format}"
158
+ end
159
+ end
94
160
 
95
- "expected #{@format.to_s.upcase} to be equivalent\n\n" \
96
- "Diff:\n" +
97
- diff.to_s(:color)
161
+ opts
98
162
  end
99
163
 
100
- def failure_message_when_negated
101
- [
102
- "expected:",
103
- @target.to_s,
104
- "not be equivalent to:",
105
- @expected.to_s,
106
- ].join("\n")
164
+ def normalize_format_for_config(format)
165
+ case format
166
+ when :html4, :html5 then :html
167
+ else format
168
+ end
107
169
  end
108
170
 
109
- def diffable
110
- true
171
+ def diff_output
172
+ # For string format, use simple diff since there's no comparison_result
173
+ if @format == :string
174
+ config_format = :xml # Use XML config as fallback for string
175
+ diff_config = Canon::Config.instance.public_send(config_format).diff
176
+
177
+ formatter = Canon::DiffFormatter.new(
178
+ use_color: diff_config.use_color,
179
+ mode: :by_line, # Always use by_line for strings
180
+ context_lines: diff_config.context_lines,
181
+ diff_grouping_lines: diff_config.grouping_lines,
182
+ show_diffs: diff_config.show_diffs,
183
+ )
184
+
185
+ return formatter.format([], :string, doc1: @expected.to_s,
186
+ doc2: @target.to_s)
187
+ end
188
+
189
+ # Get diff configuration
190
+ config_format = normalize_format_for_config(@format || :xml)
191
+ diff_config = Canon::Config.instance.public_send(config_format).diff
192
+
193
+ # Delegate to Canon::DiffFormatter - the SINGLE source of diff generation
194
+ formatter = Canon::DiffFormatter.new(
195
+ use_color: diff_config.use_color,
196
+ mode: diff_config.mode,
197
+ context_lines: diff_config.context_lines,
198
+ diff_grouping_lines: diff_config.grouping_lines,
199
+ show_diffs: diff_config.show_diffs,
200
+ verbose_diff: diff_config.verbose_diff,
201
+ )
202
+
203
+ # Format the diff using the comparison result
204
+ formatter.format_comparison_result(@comparison_result, @expected,
205
+ @target)
206
+ rescue StandardError => e
207
+ "\nError generating diff: #{e.message}"
111
208
  end
112
209
  end
113
210
 
114
211
  # Matcher methods
115
- def be_serialization_equivalent_to(expected, format: :xml)
116
- SerializationMatcher.new(expected, format)
212
+ def be_serialization_equivalent_to(expected, format: :xml,
213
+ match_profile: nil, match: nil,
214
+ preprocessing: nil)
215
+ SerializationMatcher.new(expected, format,
216
+ match_profile: match_profile,
217
+ match: match,
218
+ preprocessing: preprocessing)
117
219
  end
118
220
 
119
- def be_analogous_with(expected)
120
- SerializationMatcher.new(expected, :xml)
221
+ def be_analogous_with(expected, match_profile: nil, match: nil,
222
+ preprocessing: nil)
223
+ SerializationMatcher.new(expected, :xml,
224
+ match_profile: match_profile,
225
+ match: match,
226
+ preprocessing: preprocessing)
121
227
  end
122
228
 
123
- def be_xml_equivalent_to(expected)
124
- SerializationMatcher.new(expected, :xml)
229
+ def be_xml_equivalent_to(expected, match_profile: nil, match: nil,
230
+ preprocessing: nil)
231
+ SerializationMatcher.new(expected, :xml,
232
+ match_profile: match_profile,
233
+ match: match,
234
+ preprocessing: preprocessing)
125
235
  end
126
236
 
127
237
  def be_yaml_equivalent_to(expected)
@@ -132,7 +242,39 @@ module Canon
132
242
  SerializationMatcher.new(expected, :json)
133
243
  end
134
244
 
135
- if defined?(::RSpec)
245
+ def be_html_equivalent_to(expected, match_profile: nil, match: nil,
246
+ preprocessing: nil)
247
+ SerializationMatcher.new(expected, :html,
248
+ match_profile: match_profile,
249
+ match: match,
250
+ preprocessing: preprocessing)
251
+ end
252
+
253
+ def be_html4_equivalent_to(expected, match_profile: nil, match: nil,
254
+ preprocessing: nil)
255
+ SerializationMatcher.new(expected, :html4,
256
+ match_profile: match_profile,
257
+ match: match,
258
+ preprocessing: preprocessing)
259
+ end
260
+
261
+ def be_html5_equivalent_to(expected, match_profile: nil, match: nil,
262
+ preprocessing: nil)
263
+ SerializationMatcher.new(expected, :html5,
264
+ match_profile: match_profile,
265
+ match: match,
266
+ preprocessing: preprocessing)
267
+ end
268
+
269
+ def be_equivalent_to(expected)
270
+ SerializationMatcher.new(expected, nil)
271
+ end
272
+
273
+ def be_string_equivalent_to(expected)
274
+ SerializationMatcher.new(expected, :string)
275
+ end
276
+
277
+ if defined?(::RSpec) && ::RSpec.respond_to?(:configure)
136
278
  RSpec.configure do |config|
137
279
  config.include(Canon::RSpecMatchers)
138
280
  end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../errors"
4
+
5
+ module Canon
6
+ module Validators
7
+ # Base class for all input validators
8
+ #
9
+ # This abstract base class defines the interface that all format-specific
10
+ # validators must implement. Each validator is responsible for validating
11
+ # input in a specific format and raising detailed ValidationError when
12
+ # issues are found.
13
class BaseValidator
  # Validate input and raise ValidationError if invalid
  #
  # Abstract hook: this base implementation always raises
  # NotImplementedError; format-specific validators override it.
  #
  # @param input [String] The input to validate
  # @raise [NotImplementedError] When the receiving class has not
  #   overridden this method
  # @raise [Canon::ValidationError] If input is invalid (raised by
  #   overriding implementations)
  # @return [void]
  def self.validate!(input)
    raise NotImplementedError,
          "#{name} must implement validate! method"
  end

  # Extract line and column information from an error
  #
  # For each of line and column, the error's own accessor is used when it
  # exists and returns a non-nil value. Otherwise the error message is
  # searched for "line <n>" / "column <n>" (case-insensitive, separated by
  # colons and/or whitespace).
  #
  # @param error [Exception] The error containing location information
  # @return [Hash] Hash with :line and :column keys; a value is nil when
  #   neither the accessor nor the message provides it
  def self.extract_location(error)
    {
      line: location_component(error, :line, /line[:\s]+(\d+)/i),
      column: location_component(error, :column, /column[:\s]+(\d+)/i),
    }
  end

  # Resolve one location component (line or column) of an error
  #
  # @param error [Exception] The error to inspect
  # @param accessor [Symbol] Reader to try first (:line or :column)
  # @param pattern [Regexp] Pattern whose first capture is the number
  # @return [Object, nil] The accessor's value as-is, the Integer parsed
  #   from the message, or nil when neither is available
  def self.location_component(error, accessor, pattern)
    value = error.public_send(accessor) if error.respond_to?(accessor)
    # An accessor that exists but returns nil must not hide a location
    # that is present in the message text.
    return value unless value.nil?

    match = pattern.match(error.message.to_s)
    match && match[1].to_i
  end

  private_class_method :location_component
end
48
+ end
49
+ end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require_relative "base_validator"
5
+
6
+ module Canon
7
+ module Validators
8
+ # Validator for HTML input
9
+ #
10
+ # Validates HTML input (HTML4, HTML5, or XHTML) using Nokogiri.
11
+ # Automatically detects the HTML type and applies appropriate validation.
12
+ # Raises detailed ValidationError with line and column information
13
+ # when malformed HTML is detected.
14
class HtmlValidator < BaseValidator
  # Validate HTML input
  #
  # Nil or whitespace-only input is accepted without parsing. A leading
  # XML declaration is removed first; the remaining text is checked with
  # the strict XML parser when it looks like XHTML and with the HTML5
  # parser otherwise.
  #
  # @param input [String, nil] The HTML string to validate
  # @raise [Canon::ValidationError] If HTML is malformed
  # @return [void]
  def self.validate!(input)
    return if input.nil? || input.strip.empty?

    # Strip XML declaration for validation (it's not critical for parsing)
    cleaned_input = input.sub(/\A\s*<\?xml[^?]*\?>\s*/, "")

    if xhtml?(cleaned_input)
      validate_xhtml!(cleaned_input)
    else
      validate_html5!(cleaned_input)
    end
  end

  # Check if HTML is XHTML
  #
  # Heuristic: true when the text contains the literal "XHTML", the XHTML
  # namespace declaration, or any prefixed namespace declaration
  # (xmlns:prefix).
  # NOTE(review): the bare "XHTML" substring test also matches ordinary
  # body text that merely mentions XHTML, which sends that document
  # through strict XML parsing - confirm this is intended.
  #
  # @param html [String] The HTML string to check
  # @return [Boolean] true if XHTML, false otherwise
  def self.xhtml?(html)
    html.include?("XHTML") ||
      html.include?('xmlns="http://www.w3.org/1999/xhtml"') ||
      html.match?(/xmlns:\w+/)
  end

  # Validate XHTML input using XML strict parsing
  #
  # @param input [String] The XHTML string to validate
  # @raise [Canon::ValidationError] If XHTML is malformed
  # @return [void]
  def self.validate_xhtml!(input)
    Nokogiri::XML(input) do |config|
      config.strict.nonet
    end
  rescue Nokogiri::XML::SyntaxError => e
    location = extract_location(e)

    # extract_details may return nil; only append it when present so the
    # details text never ends in a dangling ": ".
    extra = extract_details(e)
    details = "XHTML validation failed"
    details = "#{details}: #{extra}" if extra

    raise Canon::ValidationError.new(
      e.message.split("\n").first,
      format: :html,
      line: location[:line],
      column: location[:column],
      details: details,
    )
  end

  # Validate HTML5 input
  #
  # Parses with Nokogiri::HTML5 (collecting up to 100 errors) and raises
  # for the first error that is at level 2 or above and is not
  # doctype-related.
  #
  # @param input [String] The HTML5 string to validate
  # @raise [Canon::ValidationError] If HTML5 is malformed
  # @return [void]
  def self.validate_html5!(input)
    doc = Nokogiri::HTML5(input, max_errors: 100)

    # Check for parse errors
    return unless doc.errors.any?

    # Find first significant error (level 2 = error, level 1 = warning)
    # Filter out doctype warnings and other non-critical issues
    significant_errors = doc.errors.select do |e|
      e.level >= 2 && !doctype_or_warning?(e)
    end

    return if significant_errors.empty?

    error = significant_errors.first
    location = extract_location(error)
    raise Canon::ValidationError.new(
      error.message,
      format: :html,
      line: location[:line],
      column: location[:column],
      details: build_error_details(significant_errors),
    )
  end

  # Extract additional error details
  #
  # @param error [Nokogiri::XML::SyntaxError] The syntax error
  # @return [String, nil] Nested error messages that differ from the
  #   error's own message, joined with "; "; nil when the error exposes
  #   no nested errors or none differ
  def self.extract_details(error)
    return nil unless error.respond_to?(:errors)

    details = error.errors.map(&:message).reject do |msg|
      msg == error.message
    end
    details.join("; ") unless details.empty?
  end

  # Build error details from multiple errors
  #
  # The first error is skipped because the caller already reports it as
  # the main message; up to three following errors at level 2 or above
  # are listed, each with its line number when one can be determined.
  #
  # @param errors [Array<Nokogiri::XML::SyntaxError>] Array of errors
  # @return [String, nil] Combined error details, or nil when there is
  #   nothing beyond the first error to report
  def self.build_error_details(errors)
    return nil if errors.size <= 1

    followups = errors.select { |e| e.level >= 2 }.drop(1).first(3)
    # Return nil rather than an empty string when no follow-up remains.
    return nil if followups.empty?

    followups.map do |e|
      loc = extract_location(e)
      loc[:line] ? "#{e.message} (line #{loc[:line]})" : e.message
    end.join("; ")
  end

  # Check if error is a doctype or other non-critical warning
  #
  # Any message containing "doctype" (case-insensitive) counts, which
  # includes "Expected a doctype token".
  #
  # @param error [Nokogiri::XML::SyntaxError] The error to check
  # @return [Boolean] true if error is non-critical
  def self.doctype_or_warning?(error)
    error.message.match?(/doctype/i)
  end

  private_class_method :xhtml?, :validate_xhtml!, :validate_html5!,
                       :extract_details, :build_error_details,
                       :doctype_or_warning?
end
137
+ end
138
+ end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require_relative "base_validator"
5
+
6
+ module Canon
7
+ module Validators
8
+ # Validator for JSON input
9
+ #
10
+ # Validates JSON input using Ruby's JSON parser.
11
+ # Raises detailed ValidationError with position information
12
+ # when malformed JSON is detected.
13
class JsonValidator < BaseValidator
  # Validate JSON input
  #
  # Nil or whitespace-only input is accepted without parsing. Any
  # JSON::ParserError is converted into a Canon::ValidationError carrying
  # the cleaned message, the position (when the message reports one) and
  # a short context snippet.
  #
  # @param input [String, nil] The JSON string to validate
  # @raise [Canon::ValidationError] If JSON is malformed
  # @return [void]
  def self.validate!(input)
    return if input.nil? || input.strip.empty?

    JSON.parse(input)
  rescue JSON::ParserError => e
    # Extract position from error message
    position = extract_position(e.message)

    raise Canon::ValidationError.new(
      clean_error_message(e.message),
      format: :json,
      line: position[:line],
      column: position[:column],
      details: extract_context(input, position),
    )
  end

  # Extract line and column from JSON error message
  #
  # Recognises "at line N, column M" and "at line N column M"
  # (case-insensitive). The comma is optional because parser messages
  # differ between json releases.
  # NOTE(review): newer json versions appear to emit the comma-less form -
  # confirm against the json version in use.
  # Messages that carry only a character offset yield no position, since
  # mapping an offset to line/column would require the input text.
  #
  # @param message [String] The error message
  # @return [Hash] Hash with :line and :column keys (nil when unknown)
  def self.extract_position(message)
    match = /at line (\d+),? column (\d+)/i.match(message.to_s)
    return { line: nil, column: nil } unless match

    { line: match[1].to_i, column: match[2].to_i }
  end

  # Clean error message by removing technical details
  #
  # Keeps only the text before the first " at " separator. An empty
  # message yields an empty string instead of raising.
  #
  # @param message [String] The raw error message
  # @return [String] Cleaned error message
  def self.clean_error_message(message)
    # Remove 'unexpected token' technical details and keep main message
    message.to_s.split(" at ", 2).first.to_s.strip
  end

  # Extract context around the error position
  #
  # @param input [String] The input JSON string
  # @param position [Hash] Position hash with :line key (1-based)
  # @return [String, nil] "Near: " followed by the reported line and its
  #   immediate neighbours joined by spaces; nil when the line is unknown
  #   or outside the input
  def self.extract_context(input, position)
    return nil unless position[:line]

    lines = input.split("\n")
    line_idx = position[:line] - 1
    return nil if line_idx.negative? || line_idx >= lines.size

    # Get the problematic line and surrounding lines
    start_idx = [0, line_idx - 1].max
    end_idx = [lines.size - 1, line_idx + 1].min

    context_lines = lines[start_idx..end_idx]
    "Near: #{context_lines.join(' ')}"
  end

  private_class_method :extract_position, :clean_error_message,
                       :extract_context
end
88
+ end
89
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require_relative "base_validator"
5
+
6
+ module Canon
7
+ module Validators
8
+ # Validator for XML input
9
+ #
10
+ # Validates XML input using Nokogiri's strict parsing mode.
11
+ # Raises detailed ValidationError with line and column information
12
+ # when malformed XML is detected.
13
class XmlValidator < BaseValidator
  # Validate XML input
  #
  # Nil or whitespace-only input is accepted without parsing. Everything
  # else is parsed with the strict and nonet parse options; a resulting
  # syntax error is re-raised as a Canon::ValidationError whose message is
  # the first line of the parser's message.
  #
  # @param input [String, nil] The XML string to validate
  # @raise [Canon::ValidationError] If XML is malformed
  # @return [void]
  def self.validate!(input)
    return if input.nil? || input.strip.empty?

    # Parse with strict error handling
    Nokogiri::XML(input) { |options| options.strict.nonet }
  rescue Nokogiri::XML::SyntaxError => e
    position = extract_location(e)
    headline = e.message.split("\n").first

    raise Canon::ValidationError.new(
      headline,
      format: :xml,
      line: position[:line],
      column: position[:column],
      details: extract_details(e),
    )
  end

  # Extract additional error details
  #
  # @param error [Nokogiri::XML::SyntaxError] The syntax error
  # @return [String, nil] Nested error messages that differ from the
  #   error's own message, joined with "; "; nil when the error exposes
  #   no nested errors or none differ
  def self.extract_details(error)
    return nil unless error.respond_to?(:errors)

    extras = error.errors.map(&:message).reject do |text|
      text == error.message
    end
    extras.empty? ? nil : extras.join("; ")
  end

  private_class_method :extract_details
end
52
+ end
53
+ end