canon 0.1.15 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +19 -83
- data/README.adoc +57 -0
- data/Rakefile +2 -0
- data/docs/features/index.adoc +10 -0
- data/docs/features/performance.adoc +161 -0
- data/lib/canon/comparison/html_comparator.rb +45 -11
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +7 -0
- data/lib/canon/comparison/xml_comparator.rb +38 -0
- data/lib/canon/diff/formatting_detector.rb +10 -4
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +2 -1
- data/lib/canon/tree_diff/core/tree_node.rb +6 -3
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/sax_builder.rb +360 -0
- data/lib/tasks/benchmark_runner.rb +610 -0
- data/lib/tasks/performance.rake +81 -0
- data/lib/tasks/performance_comparator.rb +100 -0
- data/lib/tasks/performance_helpers.rb +219 -0
- metadata +8 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0eb3c717365f052953d3deaf83a897112709c1a6084b472b99ddfdc2c9e43b67
|
|
4
|
+
data.tar.gz: fe4b2b513193b87692cd1fcb11569898e69c6818bea08ae3dccc753ad935f6e0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2c6d351b873ebb745c5abcdb2ff6cdbcf4ce53da1ad7f070c0b1eefeeeb776e315fa62c8d82c24b216e6e93cf5ac1790ebe3c6a171a142036ef0abc356d5a9e6
|
|
7
|
+
data.tar.gz: 6c0228d16e387e2a7919786cb57636e5c3183f0a1a1e119684fb0e01122a5ac23ffc08849f9df55bf413495024de30c4bf2e420172e13a9197d48b30636f845a
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,64 +1,24 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2026-
|
|
3
|
+
# on 2026-03-21 03:07:35 UTC using RuboCop version 1.85.1.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
|
8
8
|
|
|
9
9
|
# Offense count: 1
|
|
10
|
-
# Configuration parameters: Severity.
|
|
11
10
|
Gemspec/RequiredRubyVersion:
|
|
12
11
|
Exclude:
|
|
13
12
|
- 'canon.gemspec'
|
|
14
13
|
|
|
15
|
-
# Offense count:
|
|
16
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
17
|
-
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
18
|
-
# SupportedStyles: with_first_argument, with_fixed_indentation
|
|
19
|
-
Layout/ArgumentAlignment:
|
|
20
|
-
Exclude:
|
|
21
|
-
- 'lib/canon/xml/element_matcher.rb'
|
|
22
|
-
|
|
23
|
-
# Offense count: 23
|
|
24
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
25
|
-
# Configuration parameters: EnforcedStyleAlignWith.
|
|
26
|
-
# SupportedStylesAlignWith: either, start_of_block, start_of_line
|
|
27
|
-
Layout/BlockAlignment:
|
|
28
|
-
Exclude:
|
|
29
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
30
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
31
|
-
|
|
32
|
-
# Offense count: 23
|
|
14
|
+
# Offense count: 773
|
|
33
15
|
# This cop supports safe autocorrection (--autocorrect).
|
|
34
|
-
|
|
35
|
-
Exclude:
|
|
36
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
37
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
38
|
-
|
|
39
|
-
# Offense count: 46
|
|
40
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
41
|
-
# Configuration parameters: Width, AllowedPatterns.
|
|
42
|
-
Layout/IndentationWidth:
|
|
43
|
-
Exclude:
|
|
44
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
45
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
46
|
-
|
|
47
|
-
# Offense count: 780
|
|
48
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
49
|
-
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, IgnoreCopDirectives, AllowedPatterns, SplitStrings.
|
|
16
|
+
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
50
17
|
# URISchemes: http, https
|
|
51
18
|
Layout/LineLength:
|
|
52
19
|
Enabled: false
|
|
53
20
|
|
|
54
|
-
# Offense count:
|
|
55
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
56
|
-
# Configuration parameters: AllowInHeredoc.
|
|
57
|
-
Layout/TrailingWhitespace:
|
|
58
|
-
Exclude:
|
|
59
|
-
- 'lib/canon/xml/element_matcher.rb'
|
|
60
|
-
|
|
61
|
-
# Offense count: 48
|
|
21
|
+
# Offense count: 49
|
|
62
22
|
# Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
|
|
63
23
|
Lint/DuplicateBranch:
|
|
64
24
|
Enabled: false
|
|
@@ -87,45 +47,44 @@ Lint/UnreachableCode:
|
|
|
87
47
|
Exclude:
|
|
88
48
|
- 'lib/canon/diff_formatter/debug_output.rb'
|
|
89
49
|
|
|
90
|
-
# Offense count:
|
|
50
|
+
# Offense count: 6
|
|
91
51
|
# This cop supports safe autocorrection (--autocorrect).
|
|
92
52
|
# Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
|
|
93
53
|
# NotImplementedExceptions: NotImplementedError
|
|
94
54
|
Lint/UnusedMethodArgument:
|
|
95
55
|
Exclude:
|
|
96
|
-
- 'lib/canon/comparison.rb'
|
|
97
56
|
- 'lib/canon/diff/path_builder.rb'
|
|
98
57
|
- 'lib/canon/diff_formatter/by_line/base_formatter.rb'
|
|
99
58
|
- 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
|
|
100
59
|
- 'lib/canon/diff_formatter/by_object/base_formatter.rb'
|
|
101
60
|
|
|
102
|
-
# Offense count:
|
|
61
|
+
# Offense count: 229
|
|
103
62
|
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
|
|
104
63
|
Metrics/AbcSize:
|
|
105
64
|
Enabled: false
|
|
106
65
|
|
|
107
|
-
# Offense count:
|
|
66
|
+
# Offense count: 22
|
|
108
67
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns, inherit_mode.
|
|
109
68
|
# AllowedMethods: refine
|
|
110
69
|
Metrics/BlockLength:
|
|
111
70
|
Max: 84
|
|
112
71
|
|
|
113
|
-
# Offense count:
|
|
72
|
+
# Offense count: 187
|
|
114
73
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
115
74
|
Metrics/CyclomaticComplexity:
|
|
116
75
|
Enabled: false
|
|
117
76
|
|
|
118
|
-
# Offense count:
|
|
77
|
+
# Offense count: 394
|
|
119
78
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
120
79
|
Metrics/MethodLength:
|
|
121
|
-
Max:
|
|
80
|
+
Max: 95
|
|
122
81
|
|
|
123
|
-
# Offense count:
|
|
82
|
+
# Offense count: 45
|
|
124
83
|
# Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
|
|
125
84
|
Metrics/ParameterLists:
|
|
126
85
|
Max: 9
|
|
127
86
|
|
|
128
|
-
# Offense count:
|
|
87
|
+
# Offense count: 154
|
|
129
88
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
130
89
|
Metrics/PerceivedComplexity:
|
|
131
90
|
Enabled: false
|
|
@@ -139,16 +98,6 @@ Naming/MethodParameterName:
|
|
|
139
98
|
- 'lib/canon/comparison/xml_comparator/attribute_comparator.rb'
|
|
140
99
|
- 'lib/canon/xml/namespace_handler.rb'
|
|
141
100
|
|
|
142
|
-
# Offense count: 1
|
|
143
|
-
# Configuration parameters: NamePrefix, ForbiddenPrefixes, AllowedMethods, MethodDefinitionMacros, UseSorbetSigs.
|
|
144
|
-
# NamePrefix: is_, has_, have_, does_
|
|
145
|
-
# ForbiddenPrefixes: is_, has_, have_, does_
|
|
146
|
-
# AllowedMethods: is_a?
|
|
147
|
-
# MethodDefinitionMacros: define_method, define_singleton_method
|
|
148
|
-
Naming/PredicatePrefix:
|
|
149
|
-
Exclude:
|
|
150
|
-
- 'lib/canon/comparison/html_comparator.rb'
|
|
151
|
-
|
|
152
101
|
# Offense count: 6
|
|
153
102
|
# Configuration parameters: EnforcedStyle, CheckMethodNames, CheckSymbols, AllowedIdentifiers, AllowedPatterns.
|
|
154
103
|
# SupportedStyles: snake_case, normalcase, non_integer
|
|
@@ -159,13 +108,12 @@ Naming/VariableNumber:
|
|
|
159
108
|
- 'lib/canon/comparison/markup_comparator.rb'
|
|
160
109
|
- 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
|
|
161
110
|
|
|
162
|
-
# Offense count:
|
|
111
|
+
# Offense count: 2
|
|
163
112
|
# Configuration parameters: MinSize.
|
|
164
113
|
Performance/CollectionLiteralInLoop:
|
|
165
114
|
Exclude:
|
|
166
115
|
- 'lib/canon/comparison/html_comparator.rb'
|
|
167
116
|
- 'lib/canon/xml/xml_base_handler.rb'
|
|
168
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
169
117
|
|
|
170
118
|
# Offense count: 68
|
|
171
119
|
# Configuration parameters: Prefixes, AllowedPatterns.
|
|
@@ -184,10 +132,10 @@ RSpec/DescribeMethod:
|
|
|
184
132
|
- 'spec/canon/comparison/multiple_differences_spec.rb'
|
|
185
133
|
- 'spec/canon/diff_formatter/character_map_customization_spec.rb'
|
|
186
134
|
|
|
187
|
-
# Offense count:
|
|
135
|
+
# Offense count: 695
|
|
188
136
|
# Configuration parameters: CountAsOne.
|
|
189
137
|
RSpec/ExampleLength:
|
|
190
|
-
Max:
|
|
138
|
+
Max: 43
|
|
191
139
|
|
|
192
140
|
# Offense count: 8
|
|
193
141
|
# This cop supports safe autocorrection (--autocorrect).
|
|
@@ -240,7 +188,7 @@ RSpec/MultipleDescribes:
|
|
|
240
188
|
RSpec/MultipleExpectations:
|
|
241
189
|
Max: 15
|
|
242
190
|
|
|
243
|
-
# Offense count:
|
|
191
|
+
# Offense count: 71
|
|
244
192
|
# Configuration parameters: AllowSubject.
|
|
245
193
|
RSpec/MultipleMemoizedHelpers:
|
|
246
194
|
Max: 13
|
|
@@ -259,13 +207,12 @@ RSpec/NamedSubject:
|
|
|
259
207
|
RSpec/NestedGroups:
|
|
260
208
|
Max: 4
|
|
261
209
|
|
|
262
|
-
# Offense count:
|
|
210
|
+
# Offense count: 10
|
|
263
211
|
# Configuration parameters: AllowedPatterns.
|
|
264
212
|
# AllowedPatterns: ^expect_, ^assert_
|
|
265
213
|
RSpec/NoExpectationExample:
|
|
266
214
|
Exclude:
|
|
267
215
|
- 'spec/canon/context_grouping_spec.rb'
|
|
268
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
269
216
|
- 'spec/canon/informative_diffs_debug_spec.rb'
|
|
270
217
|
- 'spec/canon/isodoc_blockquotes_spec.rb'
|
|
271
218
|
- 'spec/canon/match_scenarios_spec.rb'
|
|
@@ -283,28 +230,17 @@ RSpec/SpecFilePathFormat:
|
|
|
283
230
|
- 'spec/canon/yaml/formatter_spec.rb'
|
|
284
231
|
- 'spec/xml_c14n_spec.rb'
|
|
285
232
|
|
|
286
|
-
# Offense count:
|
|
233
|
+
# Offense count: 126
|
|
287
234
|
# Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
|
|
288
235
|
RSpec/VerifiedDoubles:
|
|
289
236
|
Exclude:
|
|
237
|
+
- 'spec/canon/comparison/diff_node_builder_spec.rb'
|
|
290
238
|
- 'spec/canon/comparison/whitespace_sensitivity_spec.rb'
|
|
291
239
|
- 'spec/canon/diff/diff_classifier_spec.rb'
|
|
292
240
|
- 'spec/canon/diff/path_builder_spec.rb'
|
|
293
241
|
- 'spec/canon/diff/xml_serialization_formatter_spec.rb'
|
|
294
242
|
- 'spec/canon/tree_diff/operation_converter_spec.rb'
|
|
295
243
|
|
|
296
|
-
# Offense count: 44
|
|
297
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
298
|
-
# Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
|
|
299
|
-
# SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
|
|
300
|
-
# ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
|
|
301
|
-
# FunctionalMethods: let, let!, subject, watch
|
|
302
|
-
# AllowedMethods: lambda, proc, it
|
|
303
|
-
Style/BlockDelimiters:
|
|
304
|
-
Exclude:
|
|
305
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
306
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
307
|
-
|
|
308
244
|
# Offense count: 1
|
|
309
245
|
# This cop supports safe autocorrection (--autocorrect).
|
|
310
246
|
# Configuration parameters: EnforcedStyle, AllowComments.
|
data/README.adoc
CHANGED
|
@@ -901,6 +901,63 @@ After checking out the repo, run `bin/setup` to install dependencies. Then run
|
|
|
901
901
|
`rake spec` to run the tests. You can also run `bin/console` for an interactive
|
|
902
902
|
prompt.
|
|
903
903
|
|
|
904
|
+
== Performance
|
|
905
|
+
|
|
906
|
+
Canon includes a comprehensive performance benchmarking system to prevent regressions in XML/HTML parsing and comparison algorithms.
|
|
907
|
+
|
|
908
|
+
=== Running Benchmarks
|
|
909
|
+
|
|
910
|
+
[source,bash]
|
|
911
|
+
----
|
|
912
|
+
# Run all benchmarks (10s per test, takes ~5 minutes)
|
|
913
|
+
bundle exec rake performance:run
|
|
914
|
+
|
|
915
|
+
# Quick benchmark (2s per test, takes ~30 seconds)
|
|
916
|
+
bundle exec rake performance:quick
|
|
917
|
+
|
|
918
|
+
# Compare against main branch (for PRs)
|
|
919
|
+
bundle exec rake performance:compare
|
|
920
|
+
|
|
921
|
+
# Run specific category
|
|
922
|
+
bundle exec rake performance:category[xml_parsing]
|
|
923
|
+
|
|
924
|
+
# Output formats
|
|
925
|
+
bundle exec rake performance:json
|
|
926
|
+
bundle exec rake performance:yaml
|
|
927
|
+
----
|
|
928
|
+
|
|
929
|
+
=== Benchmark Categories
|
|
930
|
+
|
|
931
|
+
* **XML Parsing**: DOM vs SAX, simple vs large documents
|
|
932
|
+
* **HTML Parsing**: Simple vs complex (with scripts/tables)
|
|
933
|
+
* **XML/HTML Comparison**: Identical, similar, different documents
|
|
934
|
+
* **Format Canonicalization**: XML C14N, JSON, YAML
|
|
935
|
+
|
|
936
|
+
=== SAX Parser
|
|
937
|
+
|
|
938
|
+
Canon includes a SAX-based XML parser (`Canon::Xml::SaxBuilder`) that provides
|
|
939
|
+
significantly faster XML parsing by avoiding intermediate Nokogiri DOM trees.
|
|
940
|
+
|
|
941
|
+
[source,ruby]
|
|
942
|
+
----
|
|
943
|
+
require 'canon/xml/sax_builder'
|
|
944
|
+
|
|
945
|
+
# Parse XML directly to Canon::Xml::Node tree
|
|
946
|
+
root = Canon::Xml::SaxBuilder.parse(xml_string)
|
|
947
|
+
|
|
948
|
+
# For C14N (strips DOCTYPE to avoid DTD default attribute expansion)
|
|
949
|
+
root = Canon::Xml::SaxBuilder.parse(xml_string, strip_doctype: true)
|
|
950
|
+
----
|
|
951
|
+
|
|
952
|
+
Performance improvement: ~6x faster than DOM parsing + conversion for large documents.
|
|
953
|
+
|
|
954
|
+
=== CI Integration
|
|
955
|
+
|
|
956
|
+
Performance benchmarks run automatically on:
|
|
957
|
+
|
|
958
|
+
* **Pull requests**: Compares against the `main` branch and fails if any benchmark regresses by more than 10%
|
|
959
|
+
* **Main branch pushes**: Logs baseline metrics for performance tracking
|
|
960
|
+
|
|
904
961
|
== Contributing
|
|
905
962
|
|
|
906
963
|
Bug reports and pull requests are welcome on GitHub at
|
data/Rakefile
CHANGED
data/docs/features/index.adoc
CHANGED
|
@@ -92,6 +92,16 @@ Error handling and validation.
|
|
|
92
92
|
* Format detection
|
|
93
93
|
* Error messages
|
|
94
94
|
|
|
95
|
+
=== Performance
|
|
96
|
+
|
|
97
|
+
link:performance/[**Performance**]::
|
|
98
|
+
Performance benchmarking and optimization.
|
|
99
|
+
+
|
|
100
|
+
* SAX-based XML parser
|
|
101
|
+
* Benchmark categories
|
|
102
|
+
* CI integration
|
|
103
|
+
* Regression detection
|
|
104
|
+
|
|
95
105
|
== Quick Configuration Examples
|
|
96
106
|
|
|
97
107
|
=== Test-Friendly Comparison
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Performance
|
|
4
|
+
nav_order: 100
|
|
5
|
+
---
|
|
6
|
+
= Performance
|
|
7
|
+
|
|
8
|
+
Canon includes a comprehensive performance benchmarking system to prevent regressions in XML/HTML parsing and comparison algorithms.
|
|
9
|
+
|
|
10
|
+
== Running Benchmarks
|
|
11
|
+
|
|
12
|
+
[source,bash]
|
|
13
|
+
----
|
|
14
|
+
# Run all benchmarks (10s per test, ~5 minutes total)
|
|
15
|
+
bundle exec rake performance:run
|
|
16
|
+
|
|
17
|
+
# Quick benchmark (2s per test, ~30 seconds)
|
|
18
|
+
bundle exec rake performance:quick
|
|
19
|
+
|
|
20
|
+
# Compare against main branch (for PRs, fails on regression)
|
|
21
|
+
bundle exec rake performance:compare
|
|
22
|
+
|
|
23
|
+
# Run specific category
|
|
24
|
+
bundle exec rake performance:category[xml_parsing]
|
|
25
|
+
|
|
26
|
+
# Output in different formats
|
|
27
|
+
bundle exec rake performance:json
|
|
28
|
+
bundle exec rake performance:yaml
|
|
29
|
+
----
|
|
30
|
+
|
|
31
|
+
== Benchmark Categories
|
|
32
|
+
|
|
33
|
+
=== XML Parsing
|
|
34
|
+
|
|
35
|
+
* **DOM (simple)**: Standard Nokogiri DOM parsing
|
|
36
|
+
* **SAX (simple)**: SAX-based streaming parser
|
|
37
|
+
* **DOM (large)**: Large document DOM parsing
|
|
38
|
+
* **SAX (large)**: Large document SAX parsing
|
|
39
|
+
|
|
40
|
+
The SAX parser is typically ~20-50% faster than the DOM parser for XML parsing.
|
|
41
|
+
|
|
42
|
+
=== HTML Parsing
|
|
43
|
+
|
|
44
|
+
* **Simple HTML**: Basic HTML document parsing
|
|
45
|
+
* **Complex HTML**: HTML with scripts, styles, and tables
|
|
46
|
+
|
|
47
|
+
=== XML/HTML Comparison
|
|
48
|
+
|
|
49
|
+
* **Identical**: Comparing the same document to itself
|
|
50
|
+
* **Similar**: Comparing documents with minor differences
|
|
51
|
+
* **Different**: Comparing documents with different namespaces/structure
|
|
52
|
+
|
|
53
|
+
=== Format Canonicalization
|
|
54
|
+
|
|
55
|
+
* **XML C14N**: W3C Canonical XML
|
|
56
|
+
* **JSON**: JSON formatting
|
|
57
|
+
* **YAML**: YAML formatting
|
|
58
|
+
|
|
59
|
+
== SAX Parser
|
|
60
|
+
|
|
61
|
+
Canon includes a SAX-based XML parser (`Canon::Xml::SaxBuilder`) that provides
|
|
62
|
+
significantly faster XML parsing by avoiding intermediate Nokogiri DOM trees.
|
|
63
|
+
|
|
64
|
+
=== How It Works
|
|
65
|
+
|
|
66
|
+
Traditional parsing:
|
|
67
|
+
[source]
|
|
68
|
+
----
|
|
69
|
+
XML String → Nokogiri DOM (~60ms) → Canon::Xml::Node (~1200ms) = ~1260ms
|
|
70
|
+
----
|
|
71
|
+
|
|
72
|
+
SAX parsing:
|
|
73
|
+
[source]
|
|
74
|
+
----
|
|
75
|
+
XML String → Nokogiri SAX → Canon::Xml::Node (~200ms) = ~200ms
|
|
76
|
+
----
|
|
77
|
+
|
|
78
|
+
=== Usage
|
|
79
|
+
|
|
80
|
+
[source,ruby]
|
|
81
|
+
----
|
|
82
|
+
require 'canon/xml/sax_builder'
|
|
83
|
+
|
|
84
|
+
# Parse XML directly to Canon::Xml::Node tree
|
|
85
|
+
root = Canon::Xml::SaxBuilder.parse(xml_string)
|
|
86
|
+
|
|
87
|
+
# With options
|
|
88
|
+
root = Canon::Xml::SaxBuilder.parse(xml_string,
|
|
89
|
+
preserve_whitespace: true,
|
|
90
|
+
strip_doctype: true # For C14N compatibility
|
|
91
|
+
)
|
|
92
|
+
----
|
|
93
|
+
|
|
94
|
+
=== Options
|
|
95
|
+
|
|
96
|
+
`preserve_whitespace`:: Keep whitespace-only text nodes (default: false)
|
|
97
|
+
`strip_doctype`:: Remove DOCTYPE declaration (for C14N, avoids DTD default attribute expansion)
|
|
98
|
+
|
|
99
|
+
== CI Integration
|
|
100
|
+
|
|
101
|
+
Performance benchmarks run automatically on:
|
|
102
|
+
|
|
103
|
+
=== Pull Requests
|
|
104
|
+
|
|
105
|
+
When a PR is opened, the performance workflow compares benchmarks against the `main` branch.
|
|
106
|
+
If any benchmark regresses by more than 10%, the check fails with a clear error message.
|
|
107
|
+
|
|
108
|
+
[source,bash]
|
|
109
|
+
----
|
|
110
|
+
# CI output shows comparison
|
|
111
|
+
Comparing against: Previous branch (main).
|
|
112
|
+
Threshold: 10% regression allowed
|
|
113
|
+
|
|
114
|
+
XML Parsing: SAX (large)
|
|
115
|
+
base: 1042.30 IPS
|
|
116
|
+
curr: 1285.20 IPS
|
|
117
|
+
change: +23.31%
|
|
118
|
+
✅ OK
|
|
119
|
+
----
|
|
120
|
+
|
|
121
|
+
=== Main Branch
|
|
122
|
+
|
|
123
|
+
On pushes to `main`, benchmarks run to log baseline metrics for performance tracking.
|
|
124
|
+
|
|
125
|
+
== Threshold Configuration
|
|
126
|
+
|
|
127
|
+
The default regression threshold is 10%. Configure via rake task:
|
|
128
|
+
|
|
129
|
+
[source,bash]
|
|
130
|
+
----
|
|
131
|
+
# Custom threshold (e.g., 5%)
|
|
132
|
+
RUBYOPT="-rbenchmark-runner" bundle exec rake performance:compare
|
|
133
|
+
----
|
|
134
|
+
|
|
135
|
+
Or modify `lib/tasks/performance_comparator.rb`:
|
|
136
|
+
|
|
137
|
+
[source,ruby]
|
|
138
|
+
----
|
|
139
|
+
DEFAULT_THRESHOLD = 0.10 # 10%
|
|
140
|
+
----
|
|
141
|
+
|
|
142
|
+
== Adding New Benchmarks
|
|
143
|
+
|
|
144
|
+
Add test methods to `lib/tasks/benchmark_runner.rb`:
|
|
145
|
+
|
|
146
|
+
[source,ruby]
|
|
147
|
+
----
|
|
148
|
+
# In BENCHMARKS hash
|
|
149
|
+
BENCHMARKS = {
|
|
150
|
+
xml_parsing: [
|
|
151
|
+
# ... existing tests ...
|
|
152
|
+
{ name: "New Test", method: :my_new_test, desc: "Description" },
|
|
153
|
+
],
|
|
154
|
+
}.freeze
|
|
155
|
+
|
|
156
|
+
# Add test method
|
|
157
|
+
def my_new_test
|
|
158
|
+
xml = DataGenerator.generate_xml(items: @items)
|
|
159
|
+
measure { Canon::Xml::SaxBuilder.parse(xml) }
|
|
160
|
+
end
|
|
161
|
+
----
|
|
@@ -471,8 +471,9 @@ module Canon
|
|
|
471
471
|
child.children.each do |text_child|
|
|
472
472
|
next unless text_child.is_a?(Canon::Xml::Nodes::TextNode)
|
|
473
473
|
|
|
474
|
-
# Remove HTML comments from text content
|
|
475
|
-
|
|
474
|
+
# Remove HTML comments from text content without using regex
|
|
475
|
+
# to avoid ReDoS/incomplete sanitization vulnerabilities
|
|
476
|
+
normalized = remove_html_comments(text_child.value)
|
|
476
477
|
# Update the text value
|
|
477
478
|
text_child.instance_variable_set(:@value, normalized)
|
|
478
479
|
end
|
|
@@ -562,13 +563,9 @@ module Canon
|
|
|
562
563
|
# Also removes whitespace-only CDATA children that Nokogiri creates
|
|
563
564
|
def normalize_html_style_script_comments(doc)
|
|
564
565
|
doc.css("style, script").each do |element|
|
|
565
|
-
# Remove HTML comments from style/script content
|
|
566
|
-
#
|
|
567
|
-
|
|
568
|
-
# comment boundaries. Any remaining <!-- would be literal text
|
|
569
|
-
# (not a comment), which is safe in this context.
|
|
570
|
-
# CodeQL false positive: see https://github.com/github/codeql/issues/XXXX
|
|
571
|
-
normalized = element.content.gsub(/<!--.*?-->/m, "").strip
|
|
566
|
+
# Remove HTML comments from style/script content without using regex
|
|
567
|
+
# to avoid ReDoS/incomplete sanitization vulnerabilities
|
|
568
|
+
normalized = remove_html_comments(element.content)
|
|
572
569
|
|
|
573
570
|
if normalized.empty?
|
|
574
571
|
# Remove all children (including whitespace-only CDATA nodes)
|
|
@@ -579,6 +576,43 @@ module Canon
|
|
|
579
576
|
end
|
|
580
577
|
end
|
|
581
578
|
|
|
579
|
+
# Remove HTML comments from a string without using regex
|
|
580
|
+
# This avoids ReDoS and incomplete sanitization vulnerabilities
|
|
581
|
+
#
|
|
582
|
+
# @param text [String] Text potentially containing HTML comments
|
|
583
|
+
# @return [String] Text with HTML comments removed
|
|
584
|
+
def remove_html_comments(text)
|
|
585
|
+
return "" if text.nil?
|
|
586
|
+
|
|
587
|
+
result = +""
|
|
588
|
+
pos = 0
|
|
589
|
+
|
|
590
|
+
while pos < text.length
|
|
591
|
+
# Look for comment start
|
|
592
|
+
comment_start = text.index("<!--", pos)
|
|
593
|
+
if comment_start.nil?
|
|
594
|
+
# No more comments, append rest of text
|
|
595
|
+
result << text[pos..]
|
|
596
|
+
break
|
|
597
|
+
end
|
|
598
|
+
|
|
599
|
+
# Append text before comment
|
|
600
|
+
result << text[pos...comment_start]
|
|
601
|
+
|
|
602
|
+
# Look for comment end
|
|
603
|
+
comment_end = text.index("-->", comment_start + 4)
|
|
604
|
+
if comment_end.nil?
|
|
605
|
+
# Unclosed comment, skip the rest
|
|
606
|
+
break
|
|
607
|
+
end
|
|
608
|
+
|
|
609
|
+
# Move past the comment
|
|
610
|
+
pos = comment_end + 3
|
|
611
|
+
end
|
|
612
|
+
|
|
613
|
+
result.strip
|
|
614
|
+
end
|
|
615
|
+
|
|
582
616
|
# Normalize whitespace in text nodes according to HTML rendering rules
|
|
583
617
|
# In HTML rendering, sequences of whitespace (spaces, tabs, newlines)
|
|
584
618
|
# collapse to a single space, except in elements where whitespace is
|
|
@@ -621,8 +655,8 @@ compare_profile = nil)
|
|
|
621
655
|
next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
|
|
622
656
|
|
|
623
657
|
# Collapse whitespace sequences (spaces, tabs, newlines) to single
|
|
624
|
-
# space
|
|
625
|
-
normalized = text_node.content.
|
|
658
|
+
# space - use tr/squeeze to avoid ReDoS vulnerability from gsub(/\s+/)
|
|
659
|
+
normalized = text_node.content.tr("\t\n\r\f\v", " ").squeeze(" ")
|
|
626
660
|
|
|
627
661
|
# Trim leading/trailing whitespace if appropriate
|
|
628
662
|
normalized = normalized.strip if should_trim_text_node?(text_node)
|
|
@@ -77,6 +77,13 @@ module Canon
|
|
|
77
77
|
return build_text_difference_reason(text1, text2)
|
|
78
78
|
end
|
|
79
79
|
|
|
80
|
+
# For attribute order differences, show the actual attribute names
|
|
81
|
+
if dimension == :attribute_order
|
|
82
|
+
attrs1 = extract_attributes(node1)&.keys || []
|
|
83
|
+
attrs2 = extract_attributes(node2)&.keys || []
|
|
84
|
+
return "Attribute order changed: [#{attrs1.join(', ')}] → [#{attrs2.join(', ')}]"
|
|
85
|
+
end
|
|
86
|
+
|
|
80
87
|
# Default reason
|
|
81
88
|
"#{diff1} vs #{diff2}"
|
|
82
89
|
end
|
|
@@ -615,9 +615,47 @@ differences)
|
|
|
615
615
|
return build_text_diff_reason(text1, text2)
|
|
616
616
|
end
|
|
617
617
|
|
|
618
|
+
# For attribute value differences, show the actual values
|
|
619
|
+
if dimension == :attribute_values
|
|
620
|
+
attrs1 = extract_attributes(node1)
|
|
621
|
+
attrs2 = extract_attributes(node2)
|
|
622
|
+
return build_attribute_value_diff_reason(attrs1, attrs2)
|
|
623
|
+
end
|
|
624
|
+
|
|
625
|
+
# For attribute order differences, show the actual attribute names
|
|
626
|
+
if dimension == :attribute_order
|
|
627
|
+
attrs1 = extract_attributes(node1)&.keys || []
|
|
628
|
+
attrs2 = extract_attributes(node2)&.keys || []
|
|
629
|
+
return "Attribute order changed: [#{attrs1.join(', ')}] → [#{attrs2.join(', ')}]"
|
|
630
|
+
end
|
|
631
|
+
|
|
618
632
|
"#{diff1} vs #{diff2}"
|
|
619
633
|
end
|
|
620
634
|
|
|
635
|
+
# Build a clear reason message for attribute value differences
|
|
636
|
+
#
|
|
637
|
+
# @param attrs1 [Hash, nil] First node's attributes
|
|
638
|
+
# @param attrs2 [Hash, nil] Second node's attributes
|
|
639
|
+
# @return [String] Clear explanation of the attribute value difference
|
|
640
|
+
def build_attribute_value_diff_reason(attrs1, attrs2)
|
|
641
|
+
return "missing vs present attributes" unless attrs1 && attrs2
|
|
642
|
+
|
|
643
|
+
require "set"
|
|
644
|
+
keys1 = attrs1.keys.to_set
|
|
645
|
+
keys2 = attrs2.keys.to_set
|
|
646
|
+
|
|
647
|
+
common = keys1 & keys2
|
|
648
|
+
different_values = common.reject { |k| attrs1[k] == attrs2[k] }
|
|
649
|
+
|
|
650
|
+
return "all attribute values match" if different_values.empty?
|
|
651
|
+
|
|
652
|
+
parts = different_values.map do |k|
|
|
653
|
+
"#{k}: #{attrs1[k].inspect} vs #{attrs2[k].inspect}"
|
|
654
|
+
end
|
|
655
|
+
|
|
656
|
+
parts.join("; ")
|
|
657
|
+
end
|
|
658
|
+
|
|
621
659
|
# Build a clear reason message for attribute presence differences
|
|
622
660
|
#
|
|
623
661
|
# @param attrs1 [Hash, nil] First node's attributes
|
|
@@ -31,12 +31,18 @@ module Canon
|
|
|
31
31
|
return "" if line.nil?
|
|
32
32
|
|
|
33
33
|
# Collapse all whitespace (spaces, tabs, newlines) to single space
|
|
34
|
-
|
|
34
|
+
# Avoid regex to prevent ReDoS vulnerability - use String methods
|
|
35
|
+
normalized = line.strip.tr("\t\n\r\f\v", " ").squeeze(" ")
|
|
35
36
|
|
|
36
37
|
# Normalize whitespace around tag delimiters
|
|
37
|
-
# Remove spaces before > and after <
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
# Remove spaces before > and after < (avoid regex for ReDoS safety)
|
|
39
|
+
while normalized.include?(" >")
|
|
40
|
+
normalized = normalized.gsub(" >", ">")
|
|
41
|
+
end
|
|
42
|
+
while normalized.include?("< ")
|
|
43
|
+
normalized = normalized.gsub("< ", "<")
|
|
44
|
+
end
|
|
45
|
+
normalized
|
|
40
46
|
end
|
|
41
47
|
|
|
42
48
|
# Check if a line is blank (nil or whitespace-only)
|
|
@@ -113,7 +113,8 @@ module Canon
|
|
|
113
113
|
# show reason if available
|
|
114
114
|
if diff.respond_to?(:reason) && diff.reason
|
|
115
115
|
output << "#{colorize('Reason:', :cyan, use_color,
|
|
116
|
-
bold: true)} #{colorize(diff.reason,
|
|
116
|
+
bold: true)} #{colorize(diff.reason,
|
|
117
|
+
:yellow, use_color)}"
|
|
117
118
|
end
|
|
118
119
|
output << ""
|
|
119
120
|
|
|
@@ -307,9 +307,12 @@ module Canon
|
|
|
307
307
|
# Add value
|
|
308
308
|
result << "value:#{value}" if value
|
|
309
309
|
|
|
310
|
-
# Add attributes
|
|
311
|
-
|
|
312
|
-
|
|
310
|
+
# Add attributes (keys only, not values)
|
|
311
|
+
# This ensures nodes differing only in attribute VALUES still get matched
|
|
312
|
+
# and are then reported as attribute_updates rather than structural differences
|
|
313
|
+
# NOTE: The value differences are detected separately in detect_changes
|
|
314
|
+
attributes.each_key do |key|
|
|
315
|
+
result << "attr:#{key}"
|
|
313
316
|
end
|
|
314
317
|
|
|
315
318
|
# Add child labels
|