canon 0.1.14 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +19 -83
- data/README.adoc +57 -0
- data/Rakefile +2 -0
- data/docs/features/index.adoc +10 -0
- data/docs/features/performance.adoc +161 -0
- data/lib/canon/color_detector.rb +3 -3
- data/lib/canon/comparison/comparison_result.rb +1 -3
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +1 -1
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +1 -1
- data/lib/canon/comparison/dimensions/comments_dimension.rb +2 -2
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +1 -1
- data/lib/canon/comparison/dimensions/registry.rb +1 -1
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +2 -2
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +2 -2
- data/lib/canon/comparison/format_detector.rb +1 -1
- data/lib/canon/comparison/html_comparator.rb +47 -17
- data/lib/canon/comparison/markup_comparator.rb +7 -7
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +3 -3
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +2 -2
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +7 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +1 -1
- data/lib/canon/comparison/xml_comparator.rb +42 -8
- data/lib/canon/comparison/xml_node_comparison.rb +3 -3
- data/lib/canon/comparison.rb +1 -1
- data/lib/canon/config/env_provider.rb +3 -3
- data/lib/canon/diff/diff_block_builder.rb +2 -2
- data/lib/canon/diff/diff_context_builder.rb +1 -1
- data/lib/canon/diff/diff_node_mapper.rb +1 -1
- data/lib/canon/diff/formatting_detector.rb +10 -4
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +2 -2
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +2 -2
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +2 -2
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +1 -1
- data/lib/canon/diff_formatter/by_object/json_formatter.rb +3 -1
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +1 -1
- data/lib/canon/diff_formatter/debug_output.rb +4 -6
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +1 -1
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +7 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +7 -0
- data/lib/canon/diff_formatter.rb +5 -5
- data/lib/canon/errors.rb +3 -3
- data/lib/canon/rspec_matchers.rb +2 -2
- data/lib/canon/tree_diff/adapters/json_adapter.rb +2 -6
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +2 -6
- data/lib/canon/tree_diff/core/matching.rb +2 -2
- data/lib/canon/tree_diff/core/node_signature.rb +2 -4
- data/lib/canon/tree_diff/core/tree_node.rb +7 -4
- data/lib/canon/tree_diff/operation_converter.rb +1 -1
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +1 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/line_range_mapper.rb +1 -1
- data/lib/canon/xml/nodes/attribute_node.rb +4 -0
- data/lib/canon/xml/nodes/element_node.rb +4 -0
- data/lib/canon/xml/nodes/namespace_node.rb +4 -0
- data/lib/canon/xml/nodes/processing_instruction_node.rb +4 -0
- data/lib/canon/xml/sax_builder.rb +360 -0
- data/lib/canon.rb +1 -1
- data/lib/tasks/benchmark_runner.rb +610 -0
- data/lib/tasks/performance.rake +81 -0
- data/lib/tasks/performance_comparator.rb +100 -0
- data/lib/tasks/performance_helpers.rb +219 -0
- metadata +9 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0eb3c717365f052953d3deaf83a897112709c1a6084b472b99ddfdc2c9e43b67
|
|
4
|
+
data.tar.gz: fe4b2b513193b87692cd1fcb11569898e69c6818bea08ae3dccc753ad935f6e0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2c6d351b873ebb745c5abcdb2ff6cdbcf4ce53da1ad7f070c0b1eefeeeb776e315fa62c8d82c24b216e6e93cf5ac1790ebe3c6a171a142036ef0abc356d5a9e6
|
|
7
|
+
data.tar.gz: 6c0228d16e387e2a7919786cb57636e5c3183f0a1a1e119684fb0e01122a5ac23ffc08849f9df55bf413495024de30c4bf2e420172e13a9197d48b30636f845a
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,64 +1,24 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2026-
|
|
3
|
+
# on 2026-03-21 03:07:35 UTC using RuboCop version 1.85.1.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
|
8
8
|
|
|
9
9
|
# Offense count: 1
|
|
10
|
-
# Configuration parameters: Severity.
|
|
11
10
|
Gemspec/RequiredRubyVersion:
|
|
12
11
|
Exclude:
|
|
13
12
|
- 'canon.gemspec'
|
|
14
13
|
|
|
15
|
-
# Offense count:
|
|
16
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
17
|
-
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
18
|
-
# SupportedStyles: with_first_argument, with_fixed_indentation
|
|
19
|
-
Layout/ArgumentAlignment:
|
|
20
|
-
Exclude:
|
|
21
|
-
- 'lib/canon/xml/element_matcher.rb'
|
|
22
|
-
|
|
23
|
-
# Offense count: 23
|
|
24
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
25
|
-
# Configuration parameters: EnforcedStyleAlignWith.
|
|
26
|
-
# SupportedStylesAlignWith: either, start_of_block, start_of_line
|
|
27
|
-
Layout/BlockAlignment:
|
|
28
|
-
Exclude:
|
|
29
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
30
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
31
|
-
|
|
32
|
-
# Offense count: 23
|
|
14
|
+
# Offense count: 773
|
|
33
15
|
# This cop supports safe autocorrection (--autocorrect).
|
|
34
|
-
|
|
35
|
-
Exclude:
|
|
36
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
37
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
38
|
-
|
|
39
|
-
# Offense count: 46
|
|
40
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
41
|
-
# Configuration parameters: Width, AllowedPatterns.
|
|
42
|
-
Layout/IndentationWidth:
|
|
43
|
-
Exclude:
|
|
44
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
45
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
46
|
-
|
|
47
|
-
# Offense count: 780
|
|
48
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
49
|
-
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, IgnoreCopDirectives, AllowedPatterns, SplitStrings.
|
|
16
|
+
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
50
17
|
# URISchemes: http, https
|
|
51
18
|
Layout/LineLength:
|
|
52
19
|
Enabled: false
|
|
53
20
|
|
|
54
|
-
# Offense count:
|
|
55
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
56
|
-
# Configuration parameters: AllowInHeredoc.
|
|
57
|
-
Layout/TrailingWhitespace:
|
|
58
|
-
Exclude:
|
|
59
|
-
- 'lib/canon/xml/element_matcher.rb'
|
|
60
|
-
|
|
61
|
-
# Offense count: 48
|
|
21
|
+
# Offense count: 49
|
|
62
22
|
# Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
|
|
63
23
|
Lint/DuplicateBranch:
|
|
64
24
|
Enabled: false
|
|
@@ -87,45 +47,44 @@ Lint/UnreachableCode:
|
|
|
87
47
|
Exclude:
|
|
88
48
|
- 'lib/canon/diff_formatter/debug_output.rb'
|
|
89
49
|
|
|
90
|
-
# Offense count:
|
|
50
|
+
# Offense count: 6
|
|
91
51
|
# This cop supports safe autocorrection (--autocorrect).
|
|
92
52
|
# Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
|
|
93
53
|
# NotImplementedExceptions: NotImplementedError
|
|
94
54
|
Lint/UnusedMethodArgument:
|
|
95
55
|
Exclude:
|
|
96
|
-
- 'lib/canon/comparison.rb'
|
|
97
56
|
- 'lib/canon/diff/path_builder.rb'
|
|
98
57
|
- 'lib/canon/diff_formatter/by_line/base_formatter.rb'
|
|
99
58
|
- 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
|
|
100
59
|
- 'lib/canon/diff_formatter/by_object/base_formatter.rb'
|
|
101
60
|
|
|
102
|
-
# Offense count:
|
|
61
|
+
# Offense count: 229
|
|
103
62
|
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
|
|
104
63
|
Metrics/AbcSize:
|
|
105
64
|
Enabled: false
|
|
106
65
|
|
|
107
|
-
# Offense count:
|
|
66
|
+
# Offense count: 22
|
|
108
67
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns, inherit_mode.
|
|
109
68
|
# AllowedMethods: refine
|
|
110
69
|
Metrics/BlockLength:
|
|
111
70
|
Max: 84
|
|
112
71
|
|
|
113
|
-
# Offense count:
|
|
72
|
+
# Offense count: 187
|
|
114
73
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
115
74
|
Metrics/CyclomaticComplexity:
|
|
116
75
|
Enabled: false
|
|
117
76
|
|
|
118
|
-
# Offense count:
|
|
77
|
+
# Offense count: 394
|
|
119
78
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
120
79
|
Metrics/MethodLength:
|
|
121
|
-
Max:
|
|
80
|
+
Max: 95
|
|
122
81
|
|
|
123
|
-
# Offense count:
|
|
82
|
+
# Offense count: 45
|
|
124
83
|
# Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
|
|
125
84
|
Metrics/ParameterLists:
|
|
126
85
|
Max: 9
|
|
127
86
|
|
|
128
|
-
# Offense count:
|
|
87
|
+
# Offense count: 154
|
|
129
88
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
130
89
|
Metrics/PerceivedComplexity:
|
|
131
90
|
Enabled: false
|
|
@@ -139,16 +98,6 @@ Naming/MethodParameterName:
|
|
|
139
98
|
- 'lib/canon/comparison/xml_comparator/attribute_comparator.rb'
|
|
140
99
|
- 'lib/canon/xml/namespace_handler.rb'
|
|
141
100
|
|
|
142
|
-
# Offense count: 1
|
|
143
|
-
# Configuration parameters: NamePrefix, ForbiddenPrefixes, AllowedMethods, MethodDefinitionMacros, UseSorbetSigs.
|
|
144
|
-
# NamePrefix: is_, has_, have_, does_
|
|
145
|
-
# ForbiddenPrefixes: is_, has_, have_, does_
|
|
146
|
-
# AllowedMethods: is_a?
|
|
147
|
-
# MethodDefinitionMacros: define_method, define_singleton_method
|
|
148
|
-
Naming/PredicatePrefix:
|
|
149
|
-
Exclude:
|
|
150
|
-
- 'lib/canon/comparison/html_comparator.rb'
|
|
151
|
-
|
|
152
101
|
# Offense count: 6
|
|
153
102
|
# Configuration parameters: EnforcedStyle, CheckMethodNames, CheckSymbols, AllowedIdentifiers, AllowedPatterns.
|
|
154
103
|
# SupportedStyles: snake_case, normalcase, non_integer
|
|
@@ -159,13 +108,12 @@ Naming/VariableNumber:
|
|
|
159
108
|
- 'lib/canon/comparison/markup_comparator.rb'
|
|
160
109
|
- 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
|
|
161
110
|
|
|
162
|
-
# Offense count:
|
|
111
|
+
# Offense count: 2
|
|
163
112
|
# Configuration parameters: MinSize.
|
|
164
113
|
Performance/CollectionLiteralInLoop:
|
|
165
114
|
Exclude:
|
|
166
115
|
- 'lib/canon/comparison/html_comparator.rb'
|
|
167
116
|
- 'lib/canon/xml/xml_base_handler.rb'
|
|
168
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
169
117
|
|
|
170
118
|
# Offense count: 68
|
|
171
119
|
# Configuration parameters: Prefixes, AllowedPatterns.
|
|
@@ -184,10 +132,10 @@ RSpec/DescribeMethod:
|
|
|
184
132
|
- 'spec/canon/comparison/multiple_differences_spec.rb'
|
|
185
133
|
- 'spec/canon/diff_formatter/character_map_customization_spec.rb'
|
|
186
134
|
|
|
187
|
-
# Offense count:
|
|
135
|
+
# Offense count: 695
|
|
188
136
|
# Configuration parameters: CountAsOne.
|
|
189
137
|
RSpec/ExampleLength:
|
|
190
|
-
Max:
|
|
138
|
+
Max: 43
|
|
191
139
|
|
|
192
140
|
# Offense count: 8
|
|
193
141
|
# This cop supports safe autocorrection (--autocorrect).
|
|
@@ -240,7 +188,7 @@ RSpec/MultipleDescribes:
|
|
|
240
188
|
RSpec/MultipleExpectations:
|
|
241
189
|
Max: 15
|
|
242
190
|
|
|
243
|
-
# Offense count:
|
|
191
|
+
# Offense count: 71
|
|
244
192
|
# Configuration parameters: AllowSubject.
|
|
245
193
|
RSpec/MultipleMemoizedHelpers:
|
|
246
194
|
Max: 13
|
|
@@ -259,13 +207,12 @@ RSpec/NamedSubject:
|
|
|
259
207
|
RSpec/NestedGroups:
|
|
260
208
|
Max: 4
|
|
261
209
|
|
|
262
|
-
# Offense count:
|
|
210
|
+
# Offense count: 10
|
|
263
211
|
# Configuration parameters: AllowedPatterns.
|
|
264
212
|
# AllowedPatterns: ^expect_, ^assert_
|
|
265
213
|
RSpec/NoExpectationExample:
|
|
266
214
|
Exclude:
|
|
267
215
|
- 'spec/canon/context_grouping_spec.rb'
|
|
268
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
269
216
|
- 'spec/canon/informative_diffs_debug_spec.rb'
|
|
270
217
|
- 'spec/canon/isodoc_blockquotes_spec.rb'
|
|
271
218
|
- 'spec/canon/match_scenarios_spec.rb'
|
|
@@ -283,28 +230,17 @@ RSpec/SpecFilePathFormat:
|
|
|
283
230
|
- 'spec/canon/yaml/formatter_spec.rb'
|
|
284
231
|
- 'spec/xml_c14n_spec.rb'
|
|
285
232
|
|
|
286
|
-
# Offense count:
|
|
233
|
+
# Offense count: 126
|
|
287
234
|
# Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
|
|
288
235
|
RSpec/VerifiedDoubles:
|
|
289
236
|
Exclude:
|
|
237
|
+
- 'spec/canon/comparison/diff_node_builder_spec.rb'
|
|
290
238
|
- 'spec/canon/comparison/whitespace_sensitivity_spec.rb'
|
|
291
239
|
- 'spec/canon/diff/diff_classifier_spec.rb'
|
|
292
240
|
- 'spec/canon/diff/path_builder_spec.rb'
|
|
293
241
|
- 'spec/canon/diff/xml_serialization_formatter_spec.rb'
|
|
294
242
|
- 'spec/canon/tree_diff/operation_converter_spec.rb'
|
|
295
243
|
|
|
296
|
-
# Offense count: 44
|
|
297
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
298
|
-
# Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
|
|
299
|
-
# SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
|
|
300
|
-
# ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
|
|
301
|
-
# FunctionalMethods: let, let!, subject, watch
|
|
302
|
-
# AllowedMethods: lambda, proc, it
|
|
303
|
-
Style/BlockDelimiters:
|
|
304
|
-
Exclude:
|
|
305
|
-
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
306
|
-
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
307
|
-
|
|
308
244
|
# Offense count: 1
|
|
309
245
|
# This cop supports safe autocorrection (--autocorrect).
|
|
310
246
|
# Configuration parameters: EnforcedStyle, AllowComments.
|
data/README.adoc
CHANGED
|
@@ -901,6 +901,63 @@ After checking out the repo, run `bin/setup` to install dependencies. Then run
|
|
|
901
901
|
`rake spec` to run the tests. You can also run `bin/console` for an interactive
|
|
902
902
|
prompt.
|
|
903
903
|
|
|
904
|
+
== Performance
|
|
905
|
+
|
|
906
|
+
Canon includes a comprehensive performance benchmarking system to prevent regressions in XML/HTML parsing and comparison algorithms.
|
|
907
|
+
|
|
908
|
+
=== Running Benchmarks
|
|
909
|
+
|
|
910
|
+
[source,bash]
|
|
911
|
+
----
|
|
912
|
+
# Run all benchmarks (10s per test, takes ~5 minutes)
|
|
913
|
+
bundle exec rake performance:run
|
|
914
|
+
|
|
915
|
+
# Quick benchmark (2s per test, takes ~30 seconds)
|
|
916
|
+
bundle exec rake performance:quick
|
|
917
|
+
|
|
918
|
+
# Compare against main branch (for PRs)
|
|
919
|
+
bundle exec rake performance:compare
|
|
920
|
+
|
|
921
|
+
# Run specific category
|
|
922
|
+
bundle exec rake performance:category[xml_parsing]
|
|
923
|
+
|
|
924
|
+
# Output formats
|
|
925
|
+
bundle exec rake performance:json
|
|
926
|
+
bundle exec rake performance:yaml
|
|
927
|
+
----
|
|
928
|
+
|
|
929
|
+
=== Benchmark Categories
|
|
930
|
+
|
|
931
|
+
* **XML Parsing**: DOM vs SAX, simple vs large documents
|
|
932
|
+
* **HTML Parsing**: Simple vs complex (with scripts/tables)
|
|
933
|
+
* **XML/HTML Comparison**: Identical, similar, different documents
|
|
934
|
+
* **Format Canonicalization**: XML C14N, JSON, YAML
|
|
935
|
+
|
|
936
|
+
=== SAX Parser
|
|
937
|
+
|
|
938
|
+
Canon includes a SAX-based XML parser (`Canon::Xml::SaxBuilder`) that provides
|
|
939
|
+
significantly faster XML parsing by avoiding intermediate Nokogiri DOM trees.
|
|
940
|
+
|
|
941
|
+
[source,ruby]
|
|
942
|
+
----
|
|
943
|
+
require 'canon/xml/sax_builder'
|
|
944
|
+
|
|
945
|
+
# Parse XML directly to Canon::Xml::Node tree
|
|
946
|
+
root = Canon::Xml::SaxBuilder.parse(xml_string)
|
|
947
|
+
|
|
948
|
+
# For C14N (strips DOCTYPE to avoid DTD default attribute expansion)
|
|
949
|
+
root = Canon::Xml::SaxBuilder.parse(xml_string, strip_doctype: true)
|
|
950
|
+
----
|
|
951
|
+
|
|
952
|
+
Performance improvement: ~6x faster than DOM parsing + conversion for large documents.
|
|
953
|
+
|
|
954
|
+
=== CI Integration
|
|
955
|
+
|
|
956
|
+
Performance benchmarks run automatically on:
|
|
957
|
+
|
|
958
|
+
* **Pull requests**: Compares against `main` branch and fails if regression exceeds 10%
|
|
959
|
+
* **Main branch pushes**: Logs baseline metrics for performance tracking
|
|
960
|
+
|
|
904
961
|
== Contributing
|
|
905
962
|
|
|
906
963
|
Bug reports and pull requests are welcome on GitHub at
|
data/Rakefile
CHANGED
data/docs/features/index.adoc
CHANGED
|
@@ -92,6 +92,16 @@ Error handling and validation.
|
|
|
92
92
|
* Format detection
|
|
93
93
|
* Error messages
|
|
94
94
|
|
|
95
|
+
=== Performance
|
|
96
|
+
|
|
97
|
+
link:performance/[**Performance**]::
|
|
98
|
+
Performance benchmarking and optimization.
|
|
99
|
+
+
|
|
100
|
+
* SAX-based XML parser
|
|
101
|
+
* Benchmark categories
|
|
102
|
+
* CI integration
|
|
103
|
+
* Regression detection
|
|
104
|
+
|
|
95
105
|
== Quick Configuration Examples
|
|
96
106
|
|
|
97
107
|
=== Test-Friendly Comparison
|
|
data/docs/features/performance.adoc
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Performance
|
|
4
|
+
nav_order: 100
|
|
5
|
+
---
|
|
6
|
+
= Performance
|
|
7
|
+
|
|
8
|
+
Canon includes a comprehensive performance benchmarking system to prevent regressions in XML/HTML parsing and comparison algorithms.
|
|
9
|
+
|
|
10
|
+
== Running Benchmarks
|
|
11
|
+
|
|
12
|
+
[source,bash]
|
|
13
|
+
----
|
|
14
|
+
# Run all benchmarks (10s per test, ~5 minutes total)
|
|
15
|
+
bundle exec rake performance:run
|
|
16
|
+
|
|
17
|
+
# Quick benchmark (2s per test, ~30 seconds)
|
|
18
|
+
bundle exec rake performance:quick
|
|
19
|
+
|
|
20
|
+
# Compare against main branch (for PRs, fails on regression)
|
|
21
|
+
bundle exec rake performance:compare
|
|
22
|
+
|
|
23
|
+
# Run specific category
|
|
24
|
+
bundle exec rake performance:category[xml_parsing]
|
|
25
|
+
|
|
26
|
+
# Output in different formats
|
|
27
|
+
bundle exec rake performance:json
|
|
28
|
+
bundle exec rake performance:yaml
|
|
29
|
+
----
|
|
30
|
+
|
|
31
|
+
== Benchmark Categories
|
|
32
|
+
|
|
33
|
+
=== XML Parsing
|
|
34
|
+
|
|
35
|
+
* **DOM (simple)**: Standard Nokogiri DOM parsing
|
|
36
|
+
* **SAX (simple)**: SAX-based streaming parser
|
|
37
|
+
* **DOM (large)**: Large document DOM parsing
|
|
38
|
+
* **SAX (large)**: Large document SAX parsing
|
|
39
|
+
|
|
40
|
+
SAX parser is typically ~20-50% faster than DOM for XML parsing.
|
|
41
|
+
|
|
42
|
+
=== HTML Parsing
|
|
43
|
+
|
|
44
|
+
* **Simple HTML**: Basic HTML document parsing
|
|
45
|
+
* **Complex HTML**: HTML with scripts, styles, and tables
|
|
46
|
+
|
|
47
|
+
=== XML/HTML Comparison
|
|
48
|
+
|
|
49
|
+
* **Identical**: Comparing the same document to itself
|
|
50
|
+
* **Similar**: Comparing documents with minor differences
|
|
51
|
+
* **Different**: Comparing documents with different namespaces/structure
|
|
52
|
+
|
|
53
|
+
=== Format Canonicalization
|
|
54
|
+
|
|
55
|
+
* **XML C14N**: W3C Canonical XML
|
|
56
|
+
* **JSON**: JSON formatting
|
|
57
|
+
* **YAML**: YAML formatting
|
|
58
|
+
|
|
59
|
+
== SAX Parser
|
|
60
|
+
|
|
61
|
+
Canon includes a SAX-based XML parser (`Canon::Xml::SaxBuilder`) that provides
|
|
62
|
+
significantly faster XML parsing by avoiding intermediate Nokogiri DOM trees.
|
|
63
|
+
|
|
64
|
+
=== How It Works
|
|
65
|
+
|
|
66
|
+
Traditional parsing:
|
|
67
|
+
[source]
|
|
68
|
+
----
|
|
69
|
+
XML String → Nokogiri DOM (~60ms) → Canon::Xml::Node (~1200ms) = ~1260ms
|
|
70
|
+
----
|
|
71
|
+
|
|
72
|
+
SAX parsing:
|
|
73
|
+
[source]
|
|
74
|
+
----
|
|
75
|
+
XML String → Nokogiri SAX → Canon::Xml::Node (~200ms) = ~200ms
|
|
76
|
+
----
|
|
77
|
+
|
|
78
|
+
=== Usage
|
|
79
|
+
|
|
80
|
+
[source,ruby]
|
|
81
|
+
----
|
|
82
|
+
require 'canon/xml/sax_builder'
|
|
83
|
+
|
|
84
|
+
# Parse XML directly to Canon::Xml::Node tree
|
|
85
|
+
root = Canon::Xml::SaxBuilder.parse(xml_string)
|
|
86
|
+
|
|
87
|
+
# With options
|
|
88
|
+
root = Canon::Xml::SaxBuilder.parse(xml_string,
|
|
89
|
+
preserve_whitespace: true,
|
|
90
|
+
strip_doctype: true # For C14N compatibility
|
|
91
|
+
)
|
|
92
|
+
----
|
|
93
|
+
|
|
94
|
+
=== Options
|
|
95
|
+
|
|
96
|
+
`preserve_whitespace`:: Keep whitespace-only text nodes (default: false)
|
|
97
|
+
`strip_doctype`:: Remove DOCTYPE declaration (for C14N, avoids DTD default attribute expansion)
|
|
98
|
+
|
|
99
|
+
== CI Integration
|
|
100
|
+
|
|
101
|
+
Performance benchmarks run automatically on:
|
|
102
|
+
|
|
103
|
+
=== Pull Requests
|
|
104
|
+
|
|
105
|
+
When a PR is opened, the performance workflow compares benchmarks against the `main` branch.
|
|
106
|
+
If any benchmark regresses by more than 10%, the check fails with a clear error message.
|
|
107
|
+
|
|
108
|
+
[source,bash]
|
|
109
|
+
----
|
|
110
|
+
# CI output shows comparison
|
|
111
|
+
Comparing against: Previous branch (main).
|
|
112
|
+
Threshold: 10% regression allowed
|
|
113
|
+
|
|
114
|
+
XML Parsing: SAX (large)
|
|
115
|
+
base: 1042.30 IPS
|
|
116
|
+
curr: 1285.20 IPS
|
|
117
|
+
change: +23.31%
|
|
118
|
+
✅ OK
|
|
119
|
+
----
|
|
120
|
+
|
|
121
|
+
=== Main Branch
|
|
122
|
+
|
|
123
|
+
On pushes to `main`, benchmarks run to log baseline metrics for performance tracking.
|
|
124
|
+
|
|
125
|
+
== Threshold Configuration
|
|
126
|
+
|
|
127
|
+
The default regression threshold is 10%. Configure via rake task:
|
|
128
|
+
|
|
129
|
+
[source,bash]
|
|
130
|
+
----
|
|
131
|
+
# Custom threshold (e.g., 5%)
|
|
132
|
+
RUBYOPT="-rbenchmark-runner" bundle exec rake performance:compare
|
|
133
|
+
----
|
|
134
|
+
|
|
135
|
+
Or modify `lib/tasks/performance_comparator.rb`:
|
|
136
|
+
|
|
137
|
+
[source,ruby]
|
|
138
|
+
----
|
|
139
|
+
DEFAULT_THRESHOLD = 0.10 # 10%
|
|
140
|
+
----
|
|
141
|
+
|
|
142
|
+
== Adding New Benchmarks
|
|
143
|
+
|
|
144
|
+
Add test methods to `lib/tasks/benchmark_runner.rb`:
|
|
145
|
+
|
|
146
|
+
[source,ruby]
|
|
147
|
+
----
|
|
148
|
+
# In BENCHMARKS hash
|
|
149
|
+
BENCHMARKS = {
|
|
150
|
+
xml_parsing: [
|
|
151
|
+
# ... existing tests ...
|
|
152
|
+
{ name: "New Test", method: :my_new_test, desc: "Description" },
|
|
153
|
+
],
|
|
154
|
+
}.freeze
|
|
155
|
+
|
|
156
|
+
# Add test method
|
|
157
|
+
def my_new_test
|
|
158
|
+
xml = DataGenerator.generate_xml(items: @items)
|
|
159
|
+
measure { Canon::Xml::SaxBuilder.parse(xml) }
|
|
160
|
+
end
|
|
161
|
+
----
|
data/lib/canon/color_detector.rb
CHANGED
|
@@ -82,7 +82,7 @@ module Canon
|
|
|
82
82
|
# @return [Boolean] true if colors appear to be supported
|
|
83
83
|
def detect_from_env
|
|
84
84
|
# Check TERM variable
|
|
85
|
-
term = ENV["TERM"]
|
|
85
|
+
term = ENV.fetch("TERM", nil)
|
|
86
86
|
if term && NO_COLOR_TERMS.any? { |t| term.include?(t) }
|
|
87
87
|
# Known no-color terminals
|
|
88
88
|
return false
|
|
@@ -100,7 +100,7 @@ module Canon
|
|
|
100
100
|
end
|
|
101
101
|
|
|
102
102
|
# Check for known color-capable terminals
|
|
103
|
-
colorterm = ENV["COLORTERM"]
|
|
103
|
+
colorterm = ENV.fetch("COLORTERM", nil)
|
|
104
104
|
return true if COLOR_TERM_VALUES.include?(colorterm)
|
|
105
105
|
|
|
106
106
|
# Default: assume colors are supported on modern terminals
|
|
@@ -125,7 +125,7 @@ module Canon
|
|
|
125
125
|
# - Generic CI: check for specific TeamCity/Terminal variables
|
|
126
126
|
#
|
|
127
127
|
# @return [Boolean] true if CI environment likely supports colors
|
|
128
|
-
def detect_ci_colors
|
|
128
|
+
def detect_ci_colors # rubocop:disable Naming/PredicateMethod
|
|
129
129
|
# Most modern CI systems support ANSI colors
|
|
130
130
|
# Only disable for explicitly known non-color CI
|
|
131
131
|
return false if ENV["TERM"] == "dumb"
|
|
data/lib/canon/comparison/comparison_result.rb
CHANGED
|
@@ -44,10 +44,8 @@ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
|
|
|
44
44
|
if diff.is_a?(Canon::Diff::DiffNode)
|
|
45
45
|
diff.normative?
|
|
46
46
|
# Legacy Hash format - always considered normative (structural differences)
|
|
47
|
-
elsif diff.is_a?(Hash)
|
|
48
|
-
true
|
|
49
47
|
else
|
|
50
|
-
false
|
|
48
|
+
diff.is_a?(Hash)
|
|
51
49
|
end
|
|
52
50
|
end
|
|
53
51
|
end
|
|
data/lib/canon/comparison/dimensions/attribute_order_dimension.rb
CHANGED
|
@@ -37,7 +37,7 @@ module Canon
|
|
|
37
37
|
# @param order1 [Array<Symbol>] First attribute order
|
|
38
38
|
# @param order2 [Array<Symbol>] Second attribute order
|
|
39
39
|
# @return [Boolean] true if attribute order is exactly the same
|
|
40
|
-
def compare_strict(order1, order2)
|
|
40
|
+
def compare_strict(order1, order2) # rubocop:disable Naming/PredicateMethod
|
|
41
41
|
order1 == order2
|
|
42
42
|
end
|
|
43
43
|
|
|
data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb
CHANGED
|
@@ -37,7 +37,7 @@ module Canon
|
|
|
37
37
|
# @param names1 [Array<Symbol>] First attribute names
|
|
38
38
|
# @param names2 [Array<Symbol>] Second attribute names
|
|
39
39
|
# @return [Boolean] true if attribute names are exactly equal
|
|
40
|
-
def compare_strict(names1, names2)
|
|
40
|
+
def compare_strict(names1, names2) # rubocop:disable Naming/PredicateMethod
|
|
41
41
|
names1.sort == names2.sort
|
|
42
42
|
end
|
|
43
43
|
|
|
data/lib/canon/comparison/dimensions/comments_dimension.rb
CHANGED
|
@@ -37,7 +37,7 @@ module Canon
|
|
|
37
37
|
# @param comments1 [Array<String>] First comments array
|
|
38
38
|
# @param comments2 [Array<String>] Second comments array
|
|
39
39
|
# @return [Boolean] true if comments are exactly equal
|
|
40
|
-
def compare_strict(comments1, comments2)
|
|
40
|
+
def compare_strict(comments1, comments2) # rubocop:disable Naming/PredicateMethod
|
|
41
41
|
comments1 == comments2
|
|
42
42
|
end
|
|
43
43
|
|
|
@@ -48,7 +48,7 @@ module Canon
|
|
|
48
48
|
# @param comments1 [Array<String>] First comments array
|
|
49
49
|
# @param comments2 [Array<String>] Second comments array
|
|
50
50
|
# @return [Boolean] true if normalized comments are equal
|
|
51
|
-
def compare_normalize(comments1, comments2)
|
|
51
|
+
def compare_normalize(comments1, comments2) # rubocop:disable Naming/PredicateMethod
|
|
52
52
|
normalize_comments(comments1) == normalize_comments(comments2)
|
|
53
53
|
end
|
|
54
54
|
|
|
data/lib/canon/comparison/dimensions/element_position_dimension.rb
CHANGED
|
@@ -39,7 +39,7 @@ module Canon
|
|
|
39
39
|
# @param pos1 [Integer] First position
|
|
40
40
|
# @param pos2 [Integer] Second position
|
|
41
41
|
# @return [Boolean] true if positions are equal
|
|
42
|
-
def compare_strict(pos1, pos2)
|
|
42
|
+
def compare_strict(pos1, pos2) # rubocop:disable Naming/PredicateMethod
|
|
43
43
|
pos1 == pos2
|
|
44
44
|
end
|
|
45
45
|
|
|
data/lib/canon/comparison/dimensions/registry.rb
CHANGED
|
@@ -67,7 +67,7 @@ module Canon
|
|
|
67
67
|
# @param node2 [Object] Second node
|
|
68
68
|
# @param behavior [Symbol] Comparison behavior
|
|
69
69
|
# @return [Boolean] true if nodes match for this dimension
|
|
70
|
-
def self.compare(dimension_name, node1, node2, behavior)
|
|
70
|
+
def self.compare(dimension_name, node1, node2, behavior) # rubocop:disable Naming/PredicateMethod
|
|
71
71
|
dimension = get(dimension_name)
|
|
72
72
|
dimension.equivalent?(node1, node2, behavior)
|
|
73
73
|
end
|
|
data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb
CHANGED
|
@@ -41,7 +41,7 @@ module Canon
|
|
|
41
41
|
# @param ws1 [Array<String>] First whitespace array
|
|
42
42
|
# @param ws2 [Array<String>] Second whitespace array
|
|
43
43
|
# @return [Boolean] true if structural whitespace is exactly equal
|
|
44
|
-
def compare_strict(ws1, ws2)
|
|
44
|
+
def compare_strict(ws1, ws2) # rubocop:disable Naming/PredicateMethod
|
|
45
45
|
ws1 == ws2
|
|
46
46
|
end
|
|
47
47
|
|
|
@@ -52,7 +52,7 @@ module Canon
|
|
|
52
52
|
# @param ws1 [Array<String>] First whitespace array
|
|
53
53
|
# @param ws2 [Array<String>] Second whitespace array
|
|
54
54
|
# @return [Boolean] true if normalized structural whitespace is equal
|
|
55
|
-
def compare_normalize(ws1, ws2)
|
|
55
|
+
def compare_normalize(ws1, ws2) # rubocop:disable Naming/PredicateMethod
|
|
56
56
|
normalize_whitespace(ws1) == normalize_whitespace(ws2)
|
|
57
57
|
end
|
|
58
58
|
|
|
data/lib/canon/comparison/dimensions/text_content_dimension.rb
CHANGED
|
@@ -37,7 +37,7 @@ module Canon
|
|
|
37
37
|
# @param text1 [String, nil] First text
|
|
38
38
|
# @param text2 [String, nil] Second text
|
|
39
39
|
# @return [Boolean] true if texts are exactly equal
|
|
40
|
-
def compare_strict(text1, text2)
|
|
40
|
+
def compare_strict(text1, text2) # rubocop:disable Naming/PredicateMethod
|
|
41
41
|
text1.to_s == text2.to_s
|
|
42
42
|
end
|
|
43
43
|
|
|
@@ -49,7 +49,7 @@ module Canon
|
|
|
49
49
|
# @param text1 [String, nil] First text
|
|
50
50
|
# @param text2 [String, nil] Second text
|
|
51
51
|
# @return [Boolean] true if normalized texts are equal
|
|
52
|
-
def compare_normalize(text1, text2)
|
|
52
|
+
def compare_normalize(text1, text2) # rubocop:disable Naming/PredicateMethod
|
|
53
53
|
normalized1 = normalize_text(text1)
|
|
54
54
|
normalized2 = normalize_text(text2)
|
|
55
55
|
|
|
data/lib/canon/comparison/format_detector.rb
CHANGED
|
@@ -52,7 +52,7 @@ module Canon
|
|
|
52
52
|
# @return [Symbol] Format type
|
|
53
53
|
def detect_string(str)
|
|
54
54
|
# Use cache for format detection
|
|
55
|
-
Cache.fetch(:format_detect, Cache.key_for_format_detection(str)) do
|
|
55
|
+
Cache.fetch(:format_detect, Cache.key_for_format_detection(str)) do # rubocop:disable Lint/UselessDefaultValueArgument
|
|
56
56
|
detect_string_uncached(str)
|
|
57
57
|
end
|
|
58
58
|
end
|