canon 0.1.15 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '04860609f8d3300ccebf84a0f2208510600dcfaac4b3f54f698eb2de7eed0493'
4
- data.tar.gz: fc50abe2a915d7d7ff1cd630c0a5a50849b6fd5780664cb6bb419300b2388743
3
+ metadata.gz: 0eb3c717365f052953d3deaf83a897112709c1a6084b472b99ddfdc2c9e43b67
4
+ data.tar.gz: fe4b2b513193b87692cd1fcb11569898e69c6818bea08ae3dccc753ad935f6e0
5
5
  SHA512:
6
- metadata.gz: d1bcc3ad7439fdd7f65627c53cd0dc4b92d781bdbdaf330d2bb8ecc89b3319a4fc02e78de8e961cdb69854661881db39bf59c664210c7a59b7a011355b1db71b
7
- data.tar.gz: cca1b3f2eee48054b5431118350e64acb46f485096cd8c88798e0935210985d63eb93de4727a42d17cfaad8ef03c55e7d1745922500f3a3495a791d86fd60676
6
+ metadata.gz: 2c6d351b873ebb745c5abcdb2ff6cdbcf4ce53da1ad7f070c0b1eefeeeb776e315fa62c8d82c24b216e6e93cf5ac1790ebe3c6a171a142036ef0abc356d5a9e6
7
+ data.tar.gz: 6c0228d16e387e2a7919786cb57636e5c3183f0a1a1e119684fb0e01122a5ac23ffc08849f9df55bf413495024de30c4bf2e420172e13a9197d48b30636f845a
data/.rubocop_todo.yml CHANGED
@@ -1,64 +1,24 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-02-17 14:18:53 UTC using RuboCop version 1.81.7.
3
+ # on 2026-03-21 03:07:35 UTC using RuboCop version 1.85.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
9
  # Offense count: 1
10
- # Configuration parameters: Severity.
11
10
  Gemspec/RequiredRubyVersion:
12
11
  Exclude:
13
12
  - 'canon.gemspec'
14
13
 
15
- # Offense count: 1
16
- # This cop supports safe autocorrection (--autocorrect).
17
- # Configuration parameters: EnforcedStyle, IndentationWidth.
18
- # SupportedStyles: with_first_argument, with_fixed_indentation
19
- Layout/ArgumentAlignment:
20
- Exclude:
21
- - 'lib/canon/xml/element_matcher.rb'
22
-
23
- # Offense count: 23
24
- # This cop supports safe autocorrection (--autocorrect).
25
- # Configuration parameters: EnforcedStyleAlignWith.
26
- # SupportedStylesAlignWith: either, start_of_block, start_of_line
27
- Layout/BlockAlignment:
28
- Exclude:
29
- - 'spec/canon/fixtures/isodoc_spec.rb'
30
- - 'spec/canon/table_class_attribute_bug_spec.rb'
31
-
32
- # Offense count: 23
14
+ # Offense count: 773
33
15
  # This cop supports safe autocorrection (--autocorrect).
34
- Layout/BlockEndNewline:
35
- Exclude:
36
- - 'spec/canon/fixtures/isodoc_spec.rb'
37
- - 'spec/canon/table_class_attribute_bug_spec.rb'
38
-
39
- # Offense count: 46
40
- # This cop supports safe autocorrection (--autocorrect).
41
- # Configuration parameters: Width, AllowedPatterns.
42
- Layout/IndentationWidth:
43
- Exclude:
44
- - 'spec/canon/fixtures/isodoc_spec.rb'
45
- - 'spec/canon/table_class_attribute_bug_spec.rb'
46
-
47
- # Offense count: 780
48
- # This cop supports safe autocorrection (--autocorrect).
49
- # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, IgnoreCopDirectives, AllowedPatterns, SplitStrings.
16
+ # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
50
17
  # URISchemes: http, https
51
18
  Layout/LineLength:
52
19
  Enabled: false
53
20
 
54
- # Offense count: 1
55
- # This cop supports safe autocorrection (--autocorrect).
56
- # Configuration parameters: AllowInHeredoc.
57
- Layout/TrailingWhitespace:
58
- Exclude:
59
- - 'lib/canon/xml/element_matcher.rb'
60
-
61
- # Offense count: 48
21
+ # Offense count: 49
62
22
  # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
63
23
  Lint/DuplicateBranch:
64
24
  Enabled: false
@@ -87,45 +47,44 @@ Lint/UnreachableCode:
87
47
  Exclude:
88
48
  - 'lib/canon/diff_formatter/debug_output.rb'
89
49
 
90
- # Offense count: 7
50
+ # Offense count: 6
91
51
  # This cop supports safe autocorrection (--autocorrect).
92
52
  # Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
93
53
  # NotImplementedExceptions: NotImplementedError
94
54
  Lint/UnusedMethodArgument:
95
55
  Exclude:
96
- - 'lib/canon/comparison.rb'
97
56
  - 'lib/canon/diff/path_builder.rb'
98
57
  - 'lib/canon/diff_formatter/by_line/base_formatter.rb'
99
58
  - 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
100
59
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
101
60
 
102
- # Offense count: 215
61
+ # Offense count: 229
103
62
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
104
63
  Metrics/AbcSize:
105
64
  Enabled: false
106
65
 
107
- # Offense count: 21
66
+ # Offense count: 22
108
67
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns, inherit_mode.
109
68
  # AllowedMethods: refine
110
69
  Metrics/BlockLength:
111
70
  Max: 84
112
71
 
113
- # Offense count: 183
72
+ # Offense count: 187
114
73
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
115
74
  Metrics/CyclomaticComplexity:
116
75
  Enabled: false
117
76
 
118
- # Offense count: 369
77
+ # Offense count: 394
119
78
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
120
79
  Metrics/MethodLength:
121
- Max: 115
80
+ Max: 95
122
81
 
123
- # Offense count: 44
82
+ # Offense count: 45
124
83
  # Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
125
84
  Metrics/ParameterLists:
126
85
  Max: 9
127
86
 
128
- # Offense count: 149
87
+ # Offense count: 154
129
88
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
130
89
  Metrics/PerceivedComplexity:
131
90
  Enabled: false
@@ -139,16 +98,6 @@ Naming/MethodParameterName:
139
98
  - 'lib/canon/comparison/xml_comparator/attribute_comparator.rb'
140
99
  - 'lib/canon/xml/namespace_handler.rb'
141
100
 
142
- # Offense count: 1
143
- # Configuration parameters: NamePrefix, ForbiddenPrefixes, AllowedMethods, MethodDefinitionMacros, UseSorbetSigs.
144
- # NamePrefix: is_, has_, have_, does_
145
- # ForbiddenPrefixes: is_, has_, have_, does_
146
- # AllowedMethods: is_a?
147
- # MethodDefinitionMacros: define_method, define_singleton_method
148
- Naming/PredicatePrefix:
149
- Exclude:
150
- - 'lib/canon/comparison/html_comparator.rb'
151
-
152
101
  # Offense count: 6
153
102
  # Configuration parameters: EnforcedStyle, CheckMethodNames, CheckSymbols, AllowedIdentifiers, AllowedPatterns.
154
103
  # SupportedStyles: snake_case, normalcase, non_integer
@@ -159,13 +108,12 @@ Naming/VariableNumber:
159
108
  - 'lib/canon/comparison/markup_comparator.rb'
160
109
  - 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
161
110
 
162
- # Offense count: 13
111
+ # Offense count: 2
163
112
  # Configuration parameters: MinSize.
164
113
  Performance/CollectionLiteralInLoop:
165
114
  Exclude:
166
115
  - 'lib/canon/comparison/html_comparator.rb'
167
116
  - 'lib/canon/xml/xml_base_handler.rb'
168
- - 'spec/canon/table_class_attribute_bug_spec.rb'
169
117
 
170
118
  # Offense count: 68
171
119
  # Configuration parameters: Prefixes, AllowedPatterns.
@@ -184,10 +132,10 @@ RSpec/DescribeMethod:
184
132
  - 'spec/canon/comparison/multiple_differences_spec.rb'
185
133
  - 'spec/canon/diff_formatter/character_map_customization_spec.rb'
186
134
 
187
- # Offense count: 696
135
+ # Offense count: 695
188
136
  # Configuration parameters: CountAsOne.
189
137
  RSpec/ExampleLength:
190
- Max: 67
138
+ Max: 43
191
139
 
192
140
  # Offense count: 8
193
141
  # This cop supports safe autocorrection (--autocorrect).
@@ -240,7 +188,7 @@ RSpec/MultipleDescribes:
240
188
  RSpec/MultipleExpectations:
241
189
  Max: 15
242
190
 
243
- # Offense count: 70
191
+ # Offense count: 71
244
192
  # Configuration parameters: AllowSubject.
245
193
  RSpec/MultipleMemoizedHelpers:
246
194
  Max: 13
@@ -259,13 +207,12 @@ RSpec/NamedSubject:
259
207
  RSpec/NestedGroups:
260
208
  Max: 4
261
209
 
262
- # Offense count: 11
210
+ # Offense count: 10
263
211
  # Configuration parameters: AllowedPatterns.
264
212
  # AllowedPatterns: ^expect_, ^assert_
265
213
  RSpec/NoExpectationExample:
266
214
  Exclude:
267
215
  - 'spec/canon/context_grouping_spec.rb'
268
- - 'spec/canon/fixtures/isodoc_spec.rb'
269
216
  - 'spec/canon/informative_diffs_debug_spec.rb'
270
217
  - 'spec/canon/isodoc_blockquotes_spec.rb'
271
218
  - 'spec/canon/match_scenarios_spec.rb'
@@ -283,28 +230,17 @@ RSpec/SpecFilePathFormat:
283
230
  - 'spec/canon/yaml/formatter_spec.rb'
284
231
  - 'spec/xml_c14n_spec.rb'
285
232
 
286
- # Offense count: 120
233
+ # Offense count: 126
287
234
  # Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
288
235
  RSpec/VerifiedDoubles:
289
236
  Exclude:
237
+ - 'spec/canon/comparison/diff_node_builder_spec.rb'
290
238
  - 'spec/canon/comparison/whitespace_sensitivity_spec.rb'
291
239
  - 'spec/canon/diff/diff_classifier_spec.rb'
292
240
  - 'spec/canon/diff/path_builder_spec.rb'
293
241
  - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
294
242
  - 'spec/canon/tree_diff/operation_converter_spec.rb'
295
243
 
296
- # Offense count: 44
297
- # This cop supports safe autocorrection (--autocorrect).
298
- # Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
299
- # SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
300
- # ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
301
- # FunctionalMethods: let, let!, subject, watch
302
- # AllowedMethods: lambda, proc, it
303
- Style/BlockDelimiters:
304
- Exclude:
305
- - 'spec/canon/fixtures/isodoc_spec.rb'
306
- - 'spec/canon/table_class_attribute_bug_spec.rb'
307
-
308
244
  # Offense count: 1
309
245
  # This cop supports safe autocorrection (--autocorrect).
310
246
  # Configuration parameters: EnforcedStyle, AllowComments.
data/README.adoc CHANGED
@@ -901,6 +901,63 @@ After checking out the repo, run `bin/setup` to install dependencies. Then run
901
901
  `rake spec` to run the tests. You can also run `bin/console` for an interactive
902
902
  prompt.
903
903
 
904
+ == Performance
905
+
906
+ Canon includes a comprehensive performance benchmarking system to prevent regressions in XML/HTML parsing and comparison algorithms.
907
+
908
+ === Running Benchmarks
909
+
910
+ [source,bash]
911
+ ----
912
+ # Run all benchmarks (10s per test, takes ~5 minutes)
913
+ bundle exec rake performance:run
914
+
915
+ # Quick benchmark (2s per test, takes ~30 seconds)
916
+ bundle exec rake performance:quick
917
+
918
+ # Compare against main branch (for PRs)
919
+ bundle exec rake performance:compare
920
+
921
+ # Run specific category
922
+ bundle exec rake performance:category[xml_parsing]
923
+
924
+ # Output formats
925
+ bundle exec rake performance:json
926
+ bundle exec rake performance:yaml
927
+ ----
928
+
929
+ === Benchmark Categories
930
+
931
+ * **XML Parsing**: DOM vs SAX, simple vs large documents
932
+ * **HTML Parsing**: Simple vs complex (with scripts/tables)
933
+ * **XML/HTML Comparison**: Identical, similar, different documents
934
+ * **Format Canonicalization**: XML C14N, JSON, YAML
935
+
936
+ === SAX Parser
937
+
938
+ Canon includes a SAX-based XML parser (`Canon::Xml::SaxBuilder`) that provides
939
+ significantly faster XML parsing by avoiding intermediate Nokogiri DOM trees.
940
+
941
+ [source,ruby]
942
+ ----
943
+ require 'canon/xml/sax_builder'
944
+
945
+ # Parse XML directly to Canon::Xml::Node tree
946
+ root = Canon::Xml::SaxBuilder.parse(xml_string)
947
+
948
+ # For C14N (strips DOCTYPE to avoid DTD default attribute expansion)
949
+ root = Canon::Xml::SaxBuilder.parse(xml_string, strip_doctype: true)
950
+ ----
951
+
952
+ Performance improvement: ~6x faster than DOM parsing + conversion for large documents.
953
+
954
+ === CI Integration
955
+
956
+ Performance benchmarks run automatically on:
957
+
958
+ * **Pull requests**: Compares against `main` branch and fails if regression exceeds 10%
959
+ * **Main branch pushes**: Logs baseline metrics for performance tracking
960
+
904
961
  == Contributing
905
962
 
906
963
  Bug reports and pull requests are welcome on GitHub at
data/Rakefile CHANGED
@@ -9,4 +9,6 @@ require "rubocop/rake_task"
9
9
 
10
10
  RuboCop::RakeTask.new
11
11
 
12
+ Dir.glob("lib/tasks/**/*.rake").each { |r| load r }
13
+
12
14
  task default: %i[spec rubocop]
@@ -92,6 +92,16 @@ Error handling and validation.
92
92
  * Format detection
93
93
  * Error messages
94
94
 
95
+ === Performance
96
+
97
+ link:performance/[**Performance**]::
98
+ Performance benchmarking and optimization.
99
+ +
100
+ * SAX-based XML parser
101
+ * Benchmark categories
102
+ * CI integration
103
+ * Regression detection
104
+
95
105
  == Quick Configuration Examples
96
106
 
97
107
  === Test-Friendly Comparison
@@ -0,0 +1,161 @@
1
+ ---
2
+ layout: default
3
+ title: Performance
4
+ nav_order: 100
5
+ ---
6
+ = Performance
7
+
8
+ Canon includes a comprehensive performance benchmarking system to prevent regressions in XML/HTML parsing and comparison algorithms.
9
+
10
+ == Running Benchmarks
11
+
12
+ [source,bash]
13
+ ----
14
+ # Run all benchmarks (10s per test, ~5 minutes total)
15
+ bundle exec rake performance:run
16
+
17
+ # Quick benchmark (2s per test, ~30 seconds)
18
+ bundle exec rake performance:quick
19
+
20
+ # Compare against main branch (for PRs, fails on regression)
21
+ bundle exec rake performance:compare
22
+
23
+ # Run specific category
24
+ bundle exec rake performance:category[xml_parsing]
25
+
26
+ # Output in different formats
27
+ bundle exec rake performance:json
28
+ bundle exec rake performance:yaml
29
+ ----
30
+
31
+ == Benchmark Categories
32
+
33
+ === XML Parsing
34
+
35
+ * **DOM (simple)**: Standard Nokogiri DOM parsing
36
+ * **SAX (simple)**: SAX-based streaming parser
37
+ * **DOM (large)**: Large document DOM parsing
38
+ * **SAX (large)**: Large document SAX parsing
39
+
40
+ The SAX parser is typically ~20-50% faster than DOM for XML parsing.
41
+
42
+ === HTML Parsing
43
+
44
+ * **Simple HTML**: Basic HTML document parsing
45
+ * **Complex HTML**: HTML with scripts, styles, and tables
46
+
47
+ === XML/HTML Comparison
48
+
49
+ * **Identical**: Comparing the same document to itself
50
+ * **Similar**: Comparing documents with minor differences
51
+ * **Different**: Comparing documents with different namespaces/structure
52
+
53
+ === Format Canonicalization
54
+
55
+ * **XML C14N**: W3C Canonical XML
56
+ * **JSON**: JSON formatting
57
+ * **YAML**: YAML formatting
58
+
59
+ == SAX Parser
60
+
61
+ Canon includes a SAX-based XML parser (`Canon::Xml::SaxBuilder`) that provides
62
+ significantly faster XML parsing by avoiding intermediate Nokogiri DOM trees.
63
+
64
+ === How It Works
65
+
66
+ Traditional parsing:
67
+ [source]
68
+ ----
69
+ XML String → Nokogiri DOM (~60ms) → Canon::Xml::Node (~1200ms) = ~1260ms
70
+ ----
71
+
72
+ SAX parsing:
73
+ [source]
74
+ ----
75
+ XML String → Nokogiri SAX → Canon::Xml::Node (~200ms) = ~200ms
76
+ ----
77
+
78
+ === Usage
79
+
80
+ [source,ruby]
81
+ ----
82
+ require 'canon/xml/sax_builder'
83
+
84
+ # Parse XML directly to Canon::Xml::Node tree
85
+ root = Canon::Xml::SaxBuilder.parse(xml_string)
86
+
87
+ # With options
88
+ root = Canon::Xml::SaxBuilder.parse(xml_string,
89
+ preserve_whitespace: true,
90
+ strip_doctype: true # For C14N compatibility
91
+ )
92
+ ----
93
+
94
+ === Options
95
+
96
+ `preserve_whitespace`:: Keep whitespace-only text nodes (default: false)
97
+ `strip_doctype`:: Remove DOCTYPE declaration (for C14N, avoids DTD default attribute expansion)
98
+
99
+ == CI Integration
100
+
101
+ Performance benchmarks run automatically on:
102
+
103
+ === Pull Requests
104
+
105
+ When a PR is opened, the performance workflow compares benchmarks against the `main` branch.
106
+ If any benchmark regresses by more than 10%, the check fails with a clear error message.
107
+
108
+ [source,bash]
109
+ ----
110
+ # CI output shows comparison
111
+ Comparing against: Previous branch (main).
112
+ Threshold: 10% regression allowed
113
+
114
+ XML Parsing: SAX (large)
115
+ base: 1042.30 IPS
116
+ curr: 1285.20 IPS
117
+ change: +23.31%
118
+ ✅ OK
119
+ ----
120
+
121
+ === Main Branch
122
+
123
+ On pushes to `main`, benchmarks run to log baseline metrics for performance tracking.
124
+
125
+ == Threshold Configuration
126
+
127
+ The default regression threshold is 10%. Configure via rake task:
128
+
129
+ [source,bash]
130
+ ----
131
+ # Custom threshold (e.g., 5%)
132
+ RUBYOPT="-rbenchmark-runner" bundle exec rake performance:compare
133
+ ----
134
+
135
+ Or modify `lib/tasks/performance_comparator.rb`:
136
+
137
+ [source,ruby]
138
+ ----
139
+ DEFAULT_THRESHOLD = 0.10 # 10%
140
+ ----
141
+
142
+ == Adding New Benchmarks
143
+
144
+ Add test methods to `lib/tasks/benchmark_runner.rb`:
145
+
146
+ [source,ruby]
147
+ ----
148
+ # In BENCHMARKS hash
149
+ BENCHMARKS = {
150
+ xml_parsing: [
151
+ # ... existing tests ...
152
+ { name: "New Test", method: :my_new_test, desc: "Description" },
153
+ ],
154
+ }.freeze
155
+
156
+ # Add test method
157
+ def my_new_test
158
+ xml = DataGenerator.generate_xml(items: @items)
159
+ measure { Canon::Xml::SaxBuilder.parse(xml) }
160
+ end
161
+ ----
@@ -471,8 +471,9 @@ module Canon
471
471
  child.children.each do |text_child|
472
472
  next unless text_child.is_a?(Canon::Xml::Nodes::TextNode)
473
473
 
474
- # Remove HTML comments from text content
475
- normalized = text_child.value.gsub(/<!--.*?-->/m, "").strip
474
+ # Remove HTML comments from text content without using regex
475
+ # to avoid ReDoS/incomplete sanitization vulnerabilities
476
+ normalized = remove_html_comments(text_child.value)
476
477
  # Update the text value
477
478
  text_child.instance_variable_set(:@value, normalized)
478
479
  end
@@ -562,13 +563,9 @@ module Canon
562
563
  # Also removes whitespace-only CDATA children that Nokogiri creates
563
564
  def normalize_html_style_script_comments(doc)
564
565
  doc.css("style, script").each do |element|
565
- # Remove HTML comments from style/script content
566
- # SAFE: This regex operates on already-parsed DOM element content,
567
- # not on raw user input. The non-greedy .*? correctly matches
568
- # comment boundaries. Any remaining <!-- would be literal text
569
- # (not a comment), which is safe in this context.
570
- # CodeQL false positive: see https://github.com/github/codeql/issues/XXXX
571
- normalized = element.content.gsub(/<!--.*?-->/m, "").strip
566
+ # Remove HTML comments from style/script content without using regex
567
+ # to avoid ReDoS/incomplete sanitization vulnerabilities
568
+ normalized = remove_html_comments(element.content)
572
569
 
573
570
  if normalized.empty?
574
571
  # Remove all children (including whitespace-only CDATA nodes)
@@ -579,6 +576,43 @@ module Canon
579
576
  end
580
577
  end
581
578
 
579
+ # Remove HTML comments from a string without using regex
580
+ # This avoids ReDoS and incomplete sanitization vulnerabilities
581
+ #
582
+ # @param text [String] Text potentially containing HTML comments
583
+ # @return [String] Text with HTML comments removed
584
+ def remove_html_comments(text)
585
+ return "" if text.nil?
586
+
587
+ result = +""
588
+ pos = 0
589
+
590
+ while pos < text.length
591
+ # Look for comment start
592
+ comment_start = text.index("<!--", pos)
593
+ if comment_start.nil?
594
+ # No more comments, append rest of text
595
+ result << text[pos..]
596
+ break
597
+ end
598
+
599
+ # Append text before comment
600
+ result << text[pos...comment_start]
601
+
602
+ # Look for comment end
603
+ comment_end = text.index("-->", comment_start + 4)
604
+ if comment_end.nil?
605
+ # Unclosed comment, skip the rest
606
+ break
607
+ end
608
+
609
+ # Move past the comment
610
+ pos = comment_end + 3
611
+ end
612
+
613
+ result.strip
614
+ end
615
+
582
616
  # Normalize whitespace in text nodes according to HTML rendering rules
583
617
  # In HTML rendering, sequences of whitespace (spaces, tabs, newlines)
584
618
  # collapse to a single space, except in elements where whitespace is
@@ -621,8 +655,8 @@ compare_profile = nil)
621
655
  next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
622
656
 
623
657
  # Collapse whitespace sequences (spaces, tabs, newlines) to single
624
- # space
625
- normalized = text_node.content.gsub(/\s+/, " ")
658
+ # space - use tr/squeeze to avoid ReDoS vulnerability from gsub(/\s+/)
659
+ normalized = text_node.content.tr("\t\n\r\f\v", " ").squeeze(" ")
626
660
 
627
661
  # Trim leading/trailing whitespace if appropriate
628
662
  normalized = normalized.strip if should_trim_text_node?(text_node)
@@ -77,6 +77,13 @@ module Canon
77
77
  return build_text_difference_reason(text1, text2)
78
78
  end
79
79
 
80
+ # For attribute order differences, show the actual attribute names
81
+ if dimension == :attribute_order
82
+ attrs1 = extract_attributes(node1)&.keys || []
83
+ attrs2 = extract_attributes(node2)&.keys || []
84
+ return "Attribute order changed: [#{attrs1.join(', ')}] → [#{attrs2.join(', ')}]"
85
+ end
86
+
80
87
  # Default reason
81
88
  "#{diff1} vs #{diff2}"
82
89
  end
@@ -615,9 +615,47 @@ differences)
615
615
  return build_text_diff_reason(text1, text2)
616
616
  end
617
617
 
618
+ # For attribute value differences, show the actual values
619
+ if dimension == :attribute_values
620
+ attrs1 = extract_attributes(node1)
621
+ attrs2 = extract_attributes(node2)
622
+ return build_attribute_value_diff_reason(attrs1, attrs2)
623
+ end
624
+
625
+ # For attribute order differences, show the actual attribute names
626
+ if dimension == :attribute_order
627
+ attrs1 = extract_attributes(node1)&.keys || []
628
+ attrs2 = extract_attributes(node2)&.keys || []
629
+ return "Attribute order changed: [#{attrs1.join(', ')}] → [#{attrs2.join(', ')}]"
630
+ end
631
+
618
632
  "#{diff1} vs #{diff2}"
619
633
  end
620
634
 
635
+ # Build a clear reason message for attribute value differences
636
+ #
637
+ # @param attrs1 [Hash, nil] First node's attributes
638
+ # @param attrs2 [Hash, nil] Second node's attributes
639
+ # @return [String] Clear explanation of the attribute value difference
640
+ def build_attribute_value_diff_reason(attrs1, attrs2)
641
+ return "missing vs present attributes" unless attrs1 && attrs2
642
+
643
+ require "set"
644
+ keys1 = attrs1.keys.to_set
645
+ keys2 = attrs2.keys.to_set
646
+
647
+ common = keys1 & keys2
648
+ different_values = common.reject { |k| attrs1[k] == attrs2[k] }
649
+
650
+ return "all attribute values match" if different_values.empty?
651
+
652
+ parts = different_values.map do |k|
653
+ "#{k}: #{attrs1[k].inspect} vs #{attrs2[k].inspect}"
654
+ end
655
+
656
+ parts.join("; ")
657
+ end
658
+
621
659
  # Build a clear reason message for attribute presence differences
622
660
  #
623
661
  # @param attrs1 [Hash, nil] First node's attributes
@@ -31,12 +31,18 @@ module Canon
31
31
  return "" if line.nil?
32
32
 
33
33
  # Collapse all whitespace (spaces, tabs, newlines) to single space
34
- normalized = line.gsub(/\s+/, " ").strip
34
+ # Avoid regex to prevent ReDoS vulnerability - use String methods
35
+ normalized = line.strip.tr("\t\n\r\f\v", " ").squeeze(" ")
35
36
 
36
37
  # Normalize whitespace around tag delimiters
37
- # Remove spaces before > and after <
38
- normalized = normalized.gsub(/\s+>/, ">") # "div >" -> "div>"
39
- normalized.gsub(/<\s+/, "<") # "< div" -> "<div"
38
+ # Remove spaces before > and after < (avoid regex for ReDoS safety)
39
+ while normalized.include?(" >")
40
+ normalized = normalized.gsub(" >", ">")
41
+ end
42
+ while normalized.include?("< ")
43
+ normalized = normalized.gsub("< ", "<")
44
+ end
45
+ normalized
40
46
  end
41
47
 
42
48
  # Check if a line is blank (nil or whitespace-only)
@@ -113,7 +113,8 @@ module Canon
113
113
  # show reason if available
114
114
  if diff.respond_to?(:reason) && diff.reason
115
115
  output << "#{colorize('Reason:', :cyan, use_color,
116
- bold: true)} #{colorize(diff.reason, :yellow, use_color)}"
116
+ bold: true)} #{colorize(diff.reason,
117
+ :yellow, use_color)}"
117
118
  end
118
119
  output << ""
119
120
 
@@ -307,9 +307,12 @@ module Canon
307
307
  # Add value
308
308
  result << "value:#{value}" if value
309
309
 
310
- # Add attributes
311
- attributes.each do |key, val|
312
- result << "attr:#{key}=#{val}"
310
+ # Add attributes (keys only, not values)
311
+ # This ensures nodes differing only in attribute VALUES still get matched
312
+ # and are then reported as attribute_updates rather than structural differences
313
+ # NOTE: The value differences are detected separately in detect_changes
314
+ attributes.each_key do |key|
315
+ result << "attr:#{key}"
313
316
  end
314
317
 
315
318
  # Add child labels