canon 0.1.16 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0eb3c717365f052953d3deaf83a897112709c1a6084b472b99ddfdc2c9e43b67
4
- data.tar.gz: fe4b2b513193b87692cd1fcb11569898e69c6818bea08ae3dccc753ad935f6e0
3
+ metadata.gz: 34ab9a64b52d8598690536908941e38950bb4071fb6222b50d1cb584236e5286
4
+ data.tar.gz: 2dffcc8e29fcd1f75d78595ef73350208e7b369f2183c2df93672b94df7a6376
5
5
  SHA512:
6
- metadata.gz: 2c6d351b873ebb745c5abcdb2ff6cdbcf4ce53da1ad7f070c0b1eefeeeb776e315fa62c8d82c24b216e6e93cf5ac1790ebe3c6a171a142036ef0abc356d5a9e6
7
- data.tar.gz: 6c0228d16e387e2a7919786cb57636e5c3183f0a1a1e119684fb0e01122a5ac23ffc08849f9df55bf413495024de30c4bf2e420172e13a9197d48b30636f845a
6
+ metadata.gz: 41d784c820a7bbafd9874bf369ef303376578e74f7171e8f00a7ed1ed0b8800576ca67f51272f1464f6f36cbaec35b6f1f1e9806916e7a8c59bc6c04e827ee79
7
+ data.tar.gz: d3e354615ed4b40447ae0be7de64bf6250c40650e25f897a5155f52a74f57a8601003efd37d539abf9f514d5cbe614298a2185db7d72b68b4ff95db6da1b5b99
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-03-21 03:07:35 UTC using RuboCop version 1.85.1.
3
+ # on 2026-03-24 08:58:24 UTC using RuboCop version 1.85.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -11,13 +11,63 @@ Gemspec/RequiredRubyVersion:
11
11
  Exclude:
12
12
  - 'canon.gemspec'
13
13
 
14
- # Offense count: 773
14
+ # Offense count: 10
15
+ # This cop supports safe autocorrection (--autocorrect).
16
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
17
+ # SupportedStyles: with_first_argument, with_fixed_indentation
18
+ Layout/ArgumentAlignment:
19
+ Exclude:
20
+ - 'lib/canon/xml/data_model.rb'
21
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
22
+ - 'spec/canon/comparison/xml_whitespace_spec.rb'
23
+
24
+ # Offense count: 1
25
+ # This cop supports safe autocorrection (--autocorrect).
26
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
27
+ # SupportedStyles: with_first_element, with_fixed_indentation
28
+ Layout/ArrayAlignment:
29
+ Exclude:
30
+ - 'lib/canon/comparison/format_detector.rb'
31
+
32
+ # Offense count: 1
33
+ # This cop supports safe autocorrection (--autocorrect).
34
+ Layout/EmptyLineAfterGuardClause:
35
+ Exclude:
36
+ - 'lib/canon/xml/data_model.rb'
37
+
38
+ # Offense count: 1
39
+ # This cop supports safe autocorrection (--autocorrect).
40
+ # Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
41
+ # SupportedHashRocketStyles: key, separator, table
42
+ # SupportedColonStyles: key, separator, table
43
+ # SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
44
+ Layout/HashAlignment:
45
+ Exclude:
46
+ - 'lib/canon/comparison/format_detector.rb'
47
+
48
+ # Offense count: 831
15
49
  # This cop supports safe autocorrection (--autocorrect).
16
50
  # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
17
51
  # URISchemes: http, https
18
52
  Layout/LineLength:
19
53
  Enabled: false
20
54
 
55
+ # Offense count: 9
56
+ # This cop supports safe autocorrection (--autocorrect).
57
+ # Configuration parameters: EnforcedStyle.
58
+ # SupportedStyles: symmetrical, new_line, same_line
59
+ Layout/MultilineMethodCallBraceLayout:
60
+ Exclude:
61
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
62
+ - 'spec/canon/comparison/xml_whitespace_spec.rb'
63
+
64
+ # Offense count: 2
65
+ # This cop supports safe autocorrection (--autocorrect).
66
+ # Configuration parameters: AllowInHeredoc.
67
+ Layout/TrailingWhitespace:
68
+ Exclude:
69
+ - 'lib/canon/comparison/format_detector.rb'
70
+
21
71
  # Offense count: 49
22
72
  # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
23
73
  Lint/DuplicateBranch:
@@ -47,6 +97,13 @@ Lint/UnreachableCode:
47
97
  Exclude:
48
98
  - 'lib/canon/diff_formatter/debug_output.rb'
49
99
 
100
+ # Offense count: 1
101
+ # This cop supports safe autocorrection (--autocorrect).
102
+ # Configuration parameters: IgnoreEmptyBlocks, AllowUnusedKeywordArguments.
103
+ Lint/UnusedBlockArgument:
104
+ Exclude:
105
+ - 'lib/canon/xml/data_model.rb'
106
+
50
107
  # Offense count: 6
51
108
  # This cop supports safe autocorrection (--autocorrect).
52
109
  # Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
@@ -58,7 +115,7 @@ Lint/UnusedMethodArgument:
58
115
  - 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
59
116
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
60
117
 
61
- # Offense count: 229
118
+ # Offense count: 236
62
119
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
63
120
  Metrics/AbcSize:
64
121
  Enabled: false
@@ -69,12 +126,12 @@ Metrics/AbcSize:
69
126
  Metrics/BlockLength:
70
127
  Max: 84
71
128
 
72
- # Offense count: 187
129
+ # Offense count: 193
73
130
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
74
131
  Metrics/CyclomaticComplexity:
75
132
  Enabled: false
76
133
 
77
- # Offense count: 394
134
+ # Offense count: 403
78
135
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
79
136
  Metrics/MethodLength:
80
137
  Max: 95
@@ -84,7 +141,7 @@ Metrics/MethodLength:
84
141
  Metrics/ParameterLists:
85
142
  Max: 9
86
143
 
87
- # Offense count: 154
144
+ # Offense count: 160
88
145
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
89
146
  Metrics/PerceivedComplexity:
90
147
  Enabled: false
@@ -115,13 +172,19 @@ Performance/CollectionLiteralInLoop:
115
172
  - 'lib/canon/comparison/html_comparator.rb'
116
173
  - 'lib/canon/xml/xml_base_handler.rb'
117
174
 
175
+ # Offense count: 1
176
+ # This cop supports unsafe autocorrection (--autocorrect-all).
177
+ Performance/UnfreezeString:
178
+ Exclude:
179
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
180
+
118
181
  # Offense count: 68
119
182
  # Configuration parameters: Prefixes, AllowedPatterns.
120
183
  # Prefixes: when, with, without
121
184
  RSpec/ContextWording:
122
185
  Enabled: false
123
186
 
124
- # Offense count: 27
187
+ # Offense count: 29
125
188
  # Configuration parameters: IgnoredMetadata.
126
189
  RSpec/DescribeClass:
127
190
  Enabled: false
@@ -217,6 +280,11 @@ RSpec/NoExpectationExample:
217
280
  - 'spec/canon/isodoc_blockquotes_spec.rb'
218
281
  - 'spec/canon/match_scenarios_spec.rb'
219
282
 
283
+ # Offense count: 2
284
+ RSpec/RepeatedExample:
285
+ Exclude:
286
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
287
+
220
288
  # Offense count: 7
221
289
  # Configuration parameters: CustomTransform, IgnoreMethods, IgnoreMetadata, InflectorPath, EnforcedInflector.
222
290
  # SupportedInflectors: default, active_support
@@ -241,6 +309,17 @@ RSpec/VerifiedDoubles:
241
309
  - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
242
310
  - 'spec/canon/tree_diff/operation_converter_spec.rb'
243
311
 
312
+ # Offense count: 1
313
+ # This cop supports safe autocorrection (--autocorrect).
314
+ # Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
315
+ # SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
316
+ # ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
317
+ # FunctionalMethods: let, let!, subject, watch
318
+ # AllowedMethods: lambda, proc, it
319
+ Style/BlockDelimiters:
320
+ Exclude:
321
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
322
+
244
323
  # Offense count: 1
245
324
  # This cop supports safe autocorrection (--autocorrect).
246
325
  # Configuration parameters: EnforcedStyle, AllowComments.
@@ -263,6 +342,13 @@ Style/IdenticalConditionalBranches:
263
342
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
264
343
  - 'lib/canon/diff_formatter/legend.rb'
265
344
 
345
+ # Offense count: 1
346
+ # This cop supports safe autocorrection (--autocorrect).
347
+ # Configuration parameters: AllowMethodComparison, ComparisonsThreshold.
348
+ Style/MultipleComparison:
349
+ Exclude:
350
+ - 'lib/canon/comparison/format_detector.rb'
351
+
266
352
  # Offense count: 1
267
353
  # Configuration parameters: AllowedMethods.
268
354
  # AllowedMethods: respond_to_missing?
data/README.adoc CHANGED
@@ -214,6 +214,7 @@ Compare documents based on meaning, not formatting:
214
214
  * Comment handling with display control
215
215
  * Multiple match dimensions with behaviors
216
216
  * Predefined match profiles (strict, rendered, spec_friendly, content_only)
217
+ * **Cross-encoding comparison**: Compare XML documents with different character encodings (UTF-8, Shift_JIS, ISO-8859-1, UTF-16) — Canon automatically normalizes to UTF-8 before comparison
217
218
 
218
219
  See link:docs/MATCH_OPTIONS[Match options] for details.
219
220
 
@@ -151,22 +151,72 @@ sensitivity in XML instance documents:
151
151
  </text>
152
152
  ----
153
153
 
154
+ The `xml:space` attribute affects both structural whitespace and text content:
155
+
156
+ * **Structural whitespace** (whitespace-only text nodes between child elements)
157
+ * **Text content whitespace** (whitespace within text nodes)
158
+
159
+ .xml:space with structural_whitespace
160
+ [example]
161
+ ====
162
+ [source,ruby]
163
+ ----
164
+ # With xml:space="preserve", structural whitespace is preserved
165
+ xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
166
+ xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
167
+
168
+ # These are NOT equivalent (structural whitespace differs)
169
+ Canon::Comparison.equivalent?(xml1, xml2)
170
+ # => false
171
+ ----
172
+ ====
173
+
174
+ .xml:space with text_content
175
+ [example]
176
+ ====
177
+ [source,ruby]
178
+ ----
179
+ # With xml:space="preserve", text content whitespace is preserved
180
+ xml1 = '<root xml:space="preserve"><code> indented </code></root>'
181
+ xml2 = '<root xml:space="preserve"><code>indented</code></root>'
182
+
183
+ # These are NOT equivalent (text whitespace differs)
184
+ Canon::Comparison.equivalent?(xml1, xml2,
185
+ match: { text_content: :strict }
186
+ )
187
+ # => false
188
+ ----
189
+ ====
190
+
154
191
  ==== Whitelist and blacklist options
155
192
 
156
- You can explicitly specify which elements are whitespace-sensitive:
193
+ You can explicitly specify which elements are whitespace-sensitive using either short or long option names:
157
194
 
158
195
  [source,ruby]
159
196
  ----
160
- # Specify elements that preserve whitespace
197
+ # Short names (preferred)
198
+ Canon::Comparison.equivalent?(xml1, xml2,
199
+ match: {
200
+ structural_whitespace: :strict,
201
+ sensitive_elements: ["pre", "code", "sample"],
202
+ insensitive_elements: ["div", "span"]
203
+ }
204
+ )
205
+
206
+ # Long names (backward-compatible)
161
207
  Canon::Comparison.equivalent?(xml1, xml2,
162
208
  match: {
163
209
  structural_whitespace: :strict,
164
- whitespace_sensitive_elements: [:pre, :code, :sample],
165
- whitespace_insensitive_elements: [:p, :div] # Override defaults/whitelist
210
+ whitespace_sensitive_elements: ["pre", "code", "sample"],
211
+ whitespace_insensitive_elements: ["div", "span"]
166
212
  }
167
213
  )
168
214
  ----
169
215
 
216
+ **Element names are strings** (not symbols) for consistency with XML/HTML conventions.
217
+
218
+ **Blacklist takes precedence over whitelist** — if an element appears in both lists, whitespace is stripped.
219
+
170
220
  ==== respect_xml_space option
171
221
 
172
222
  Control whether xml:space attributes in the document are honored:
@@ -211,34 +261,80 @@ When determining if an element is whitespace-sensitive, Canon uses this priority
211
261
 
212
262
  ==== Format-specific defaults
213
263
 
214
- **HTML**:: `[:pre, :textarea, :script, :style]` - These elements preserve whitespace by HTML specification
264
+ **HTML**:: `["pre", "textarea", "script", "style"]` - These elements preserve whitespace by HTML specification
215
265
  **XML**:: `[]` - No default whitespace-sensitive elements, purely user-controlled
216
266
 
267
+ ==== Two types of whitespace sensitivity
268
+
269
+ Canon handles two distinct whitespace concerns:
270
+
271
+ **1. Structural whitespace stripping** — whitespace-only text nodes between sibling elements (indentation, newlines). These are normally not semantically meaningful and are stripped by default for XML to enable ElementMatcher to work correctly.
272
+
273
+ **2. Text content comparison** — how non-whitespace text content is compared. Controlled by `structural_whitespace` and `text_content` dimension behaviors (`:strict`, `:normalize`, `:ignore`).
274
+
275
+ The `sensitive_elements` / `insensitive_elements` options control both concerns:
276
+
277
+ [source,ruby]
278
+ ----
279
+ # For XML: structural whitespace is stripped by default
280
+ # Use sensitive_elements to preserve whitespace in specific elements
281
+ xml1 = "<root><item>Test</item></root>"
282
+ xml2 = "<root>\n <item>Test</item>\n</root>"
283
+
284
+ # Only <item> is whitespace-sensitive; the indentation sits directly inside <root>, so it is still stripped
285
+ Canon::Comparison.equivalent?(xml1, xml2,
286
+ match: {
287
+ structural_whitespace: :strict,
288
+ sensitive_elements: ["item"]
289
+ }
290
+ )
291
+ # => true
292
+ ----
293
+
294
+ **Precedence**: blacklist (`insensitive_elements`) > whitelist (`sensitive_elements`) > format defaults
295
+
296
+ **No inheritance**: Only the immediate parent element's name is checked — not ancestor elements.
297
+
217
298
  ==== Examples
218
299
 
219
- .Using xml:space attribute
300
+ .Using xml:space="preserve" for structural whitespace
301
+ [source,ruby]
302
+ ----
303
+ xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
304
+ xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
305
+
306
+ # Structural whitespace differs - NOT equivalent
307
+ Canon::Comparison.equivalent?(xml1, xml2)
308
+ # => false
309
+ ----
310
+
311
+ .Using xml:space="preserve" for text content
220
312
  [source,ruby]
221
313
  ----
222
- xml1 = '<root><code xml:space="preserve"> indented </code></root>'
223
- xml2 = '<root><code xml:space="preserve">indented</code></root>'
314
+ xml1 = '<root><code xml:space="preserve"> multiple spaces </code></root>'
315
+ xml2 = '<root><code xml:space="preserve">multiple spaces</code></root>'
224
316
 
225
- # These are NOT equivalent (whitespace matters in xml:space="preserve")
317
+ # Text content whitespace differs - NOT equivalent with text_content: :strict
226
318
  Canon::Comparison.equivalent?(xml1, xml2,
227
- match: { structural_whitespace: :strict }
319
+ match: { text_content: :strict }
228
320
  )
229
321
  # => false
230
322
  ----
231
323
 
232
- .Using whitelist
324
+ .Using sensitive_elements whitelist
233
325
  [source,ruby]
234
326
  ----
235
- # Make <p> elements whitespace-sensitive
327
+ # Make <sample> elements whitespace-sensitive (strings, not symbols)
328
+ xml1 = "<sample>\n content\n</sample>"
329
+ xml2 = "<sample>content</sample>"
330
+
236
331
  Canon::Comparison.equivalent?(xml1, xml2,
237
332
  match: {
238
333
  structural_whitespace: :strict,
239
- whitespace_sensitive_elements: [:p, :pre]
334
+ sensitive_elements: ["sample"]
240
335
  }
241
336
  )
337
+ # => false (structural whitespace differs in <sample>)
242
338
  ----
243
339
 
244
340
  .Overriding HTML defaults
@@ -249,7 +345,7 @@ Canon::Comparison.equivalent?(html1, html2,
249
345
  format: :html,
250
346
  match: {
251
347
  structural_whitespace: :strict,
252
- whitespace_insensitive_elements: [:script]
348
+ insensitive_elements: ["script"]
253
349
  }
254
350
  )
255
351
  ----
@@ -636,12 +732,12 @@ expect(actual).to be_xml_equivalent_to(expected,
636
732
  element_hierarchy: :ignore
637
733
  )
638
734
 
639
- # Element-level whitespace sensitivity
735
+ # Element-level whitespace sensitivity (strings, not symbols)
640
736
  expect(actual).to be_xml_equivalent_to(expected,
641
737
  match: { structural_whitespace: :strict }
642
738
  )
643
739
  .with_options(
644
- whitespace_sensitive_elements: [:pre, :code, :sample],
740
+ sensitive_elements: ["pre", "code", "sample"],
645
741
  respect_xml_space: true
646
742
  )
647
743
 
@@ -650,7 +746,7 @@ expect(html).to be_html_equivalent_to(expected,
650
746
  match: { structural_whitespace: :strict }
651
747
  )
652
748
  .with_options(
653
- whitespace_insensitive_elements: [:script, :style]
749
+ insensitive_elements: ["script", "style"]
654
750
  )
655
751
  ====
656
752
 
data/docs/lychee.toml CHANGED
@@ -9,9 +9,6 @@ max_cache_age = "1d"
9
9
  # Check both source files and built site
10
10
  include_verbatim = true
11
11
 
12
- # Recursively check all files
13
- recursive = true
14
-
15
12
  # File types to check (regex patterns)
16
13
  include = [
17
14
  "_site/**/*.html",
@@ -50,9 +47,6 @@ user_agent = "lychee/canon-docs-link-checker"
50
47
  # Check HTTP, HTTPS, and file:// schemes
51
48
  scheme = ["https", "http", "file"]
52
49
 
53
- # Include file:// URLs for local link checking
54
- include_file = true
55
-
56
50
  # Handle different link types
57
51
  include_mail = false # Don't check mailto: links
58
52
 
@@ -66,7 +60,4 @@ verbose = "warn"
66
60
  require_https = false # Don't enforce
67
61
 
68
62
  # Index files for directory URLs
69
- index_files = ["index.html"]
70
-
71
- # Ignore patterns file
72
- ignore_file = ".lycheeignore"
63
+ index_files = ["index.html"]
@@ -340,6 +340,44 @@ Special attributes like `xml:lang`, `xml:space`, `xml:id`, and `xml:base` are pr
340
340
  When `xml:space="preserve"` is set, whitespace is preserved in descendants.
341
341
  ----
342
342
 
343
+ === Cross-encoding comparison
344
+
345
+ Canon automatically normalizes XML character encodings before comparison, enabling
346
+ cross-encoding comparisons to work correctly.
347
+
348
+ **Supported encodings**: UTF-8, UTF-16 (all variants), Shift_JIS, EUC-JP, ISO-8859-1, and more.
349
+
350
+ **How it works**:
351
+
352
+ 1. Extract the declared encoding from the XML declaration (e.g., `encoding="Shift_JIS"`)
353
+ 2. If declared encoding differs from UTF-8, transcode to UTF-8
354
+ 3. Handle cases where the declared encoding doesn't match actual bytes
355
+ 4. Use safe transcoding with replacement characters for invalid sequences
356
+
357
+ .Cross-encoding comparison example
358
+ [example]
359
+ ====
360
+ [source,ruby]
361
+ ----
362
+ # UTF-8 vs Shift_JIS - automatically normalized
363
+ xml1 = "<root>日本語</root>" # UTF-8
364
+ xml2 = "<root>日本語</root>".encode("Shift_JIS") # Shift_JIS
365
+
366
+ Canon::Comparison.equivalent?(xml1, xml2)
367
+ # => true (automatically transcoded to UTF-8 before comparison)
368
+
369
+ # ASCII content works across all encodings
370
+ xml3 = "<root>hello</root>"
371
+ xml4 = "<root>hello</root>".encode("ISO-8859-1")
372
+
373
+ Canon::Comparison.equivalent?(xml3, xml4)
374
+ # => true
375
+ ----
376
+ ====
377
+
378
+ This means you can compare XML files from different sources or systems without
379
+ worrying about their native encoding.
380
+
343
381
  == Usage examples
344
382
 
345
383
  === Basic XML comparison
data/lib/canon/cache.rb CHANGED
@@ -89,7 +89,8 @@ module Canon
89
89
  # @return [String] Cache key
90
90
  def key_for_format_detection(content)
91
91
  # Use the first 101 chars (indices 0..100) for a quick key, plus length
92
- preview = content[0..100]
92
+ # Force to binary to avoid encoding compatibility issues
93
+ preview = content[0..100].b
93
94
  digest = Digest::SHA256.hexdigest(preview + content.length.to_s)
94
95
  "fmt:#{digest[0..16]}"
95
96
  end
@@ -62,7 +62,21 @@ module Canon
62
62
  # @param str [String] String to detect format of
63
63
  # @return [Symbol] Format type
64
64
  def detect_string_uncached(str)
65
- trimmed = str.strip
65
+ # Convert to UTF-8 for consistent handling if possible
66
+ # This handles cases like UTF-16 encoded XML that would otherwise fail string operations
67
+ str_utf8 = if ["UTF-16", "UTF-16BE",
68
+ "UTF-16LE"].include?(str.encoding.name)
69
+ begin
70
+ str.encode("UTF-8", str.encoding, invalid: :replace,
71
+ undef: :replace, replace: "?")
72
+ rescue EncodingError
73
+ str.dup.force_encoding("BINARY").encode("UTF-8")
74
+ end
75
+ else
76
+ str
77
+ end
78
+
79
+ trimmed = str_utf8.strip
66
80
 
67
81
  # YAML indicators
68
82
  return :yaml if trimmed.start_with?("---")
@@ -174,12 +174,17 @@ module Canon
174
174
  end
175
175
  end
176
176
 
177
- # Filter out whitespace-only text nodes based on structural_whitespace setting
178
- if match_opts && %i[ignore
179
- normalize].include?(match_opts[:structural_whitespace]) && text_node?(node)
180
- text = node_text(node)
181
- return true if MatchOptions.normalize_text(text).empty?
182
- end
177
+ # Strip whitespace-only text nodes based on parent element configuration.
178
+ # Use sensitive_elements / insensitive_elements to control.
179
+ # Blacklist (insensitive) > whitelist (sensitive) > format defaults.
180
+ return false unless text_node?(node) && node.parent
181
+ return false unless MatchOptions.normalize_text(node_text(node)).empty?
182
+
183
+ return true unless WhitespaceSensitivity.whitespace_preserved?(
184
+ node.parent, match_opts
185
+ )
186
+
187
+ false
183
188
 
184
189
  false
185
190
  end
@@ -121,6 +121,8 @@ module Canon
121
121
  hash_matching
122
122
  similarity_matching
123
123
  propagation
124
+ sensitive_elements
125
+ insensitive_elements
124
126
  whitespace_sensitive_elements
125
127
  whitespace_insensitive_elements
126
128
  respect_xml_space
@@ -8,6 +8,12 @@ module Canon
8
8
  # XML/HTML-specific match options resolver
9
9
  class XmlResolver < BaseResolver
10
10
  # Format-specific defaults for XML/HTML
11
+ #
12
+ # Sensitive elements (preserve structural whitespace):
13
+ # - XML: none by default — all structural whitespace stripped
14
+ # - HTML: pre, code, textarea, script, style by default
15
+ # Use sensitive_elements option to add elements that preserve whitespace.
16
+ #
11
17
  FORMAT_DEFAULTS = {
12
18
  html: {
13
19
  preprocessing: :rendered,
@@ -33,7 +39,9 @@ module Canon
33
39
 
34
40
  # Predefined match profiles for XML/HTML
35
41
  MATCH_PROFILES = {
36
- # Strict: Match exactly as written in source (XML default)
42
+ # Strict: Match exactly as written in source (XML default).
43
+ # Structural whitespace is stripped by default for XML.
44
+ # Use sensitive_elements to preserve structural whitespace in specific elements.
37
45
  strict: {
38
46
  preprocessing: :none,
39
47
  text_content: :strict,
@@ -66,6 +66,88 @@ module Canon
66
66
  element_sensitive?(node, opts)
67
67
  end
68
68
 
69
+ # Check if structural whitespace is preserved (not stripped) for an element.
70
+ #
71
+ # Uses sensitive_elements (whitelist) and insensitive_elements (blacklist)
72
+ # from match_opts. Blacklist takes precedence over whitelist.
73
+ # Format defaults apply to elements that match neither list.
74
+ #
75
+ # No inheritance from ancestors — checks only the immediate parent element name.
76
+ #
77
+ # @param element [Object] Element node to check
78
+ # @param match_opts [Hash] Resolved match options
79
+ # @return [Boolean] true if whitespace is preserved (not stripped)
80
+ def whitespace_preserved?(element, match_opts)
81
+ return false unless element
82
+ return false unless element.respond_to?(:name)
83
+
84
+ elem_name = element.name.to_s
85
+
86
+ # Blacklist: always strip (highest priority)
87
+ insensitive_raw = match_opts[:insensitive_elements]
88
+ insensitive_raw ||= match_opts[:whitespace_insensitive_elements]
89
+ insensitive = (insensitive_raw || []).map(&:to_s)
90
+ return false if insensitive.include?(elem_name)
91
+
92
+ # Honor the document's xml:space attributes only when respect_xml_space is enabled
93
+ if respect_xml_space?(match_opts)
94
+ # Check xml:space="preserve" (document declaration)
95
+ return true if xml_space_preserve?(element)
96
+
97
+ # Check xml:space="default" (use configured behavior)
98
+ return false if xml_space_default?(element)
99
+ end
100
+
101
+ # Whitelist: preserve whitespace
102
+ sensitive = resolved_sensitive_elements(match_opts)
103
+ return true if sensitive.include?(elem_name)
104
+
105
+ # Default: preserve for HTML, strip for XML
106
+ format = match_opts[:format] || :xml
107
+ case format
108
+ when :html, :html4, :html5
109
+ true
110
+ else
111
+ false
112
+ end
113
+ end
114
+
115
+ # Get resolved list of whitespace-sensitive element names (strings).
116
+ #
117
+ # Combines format defaults + user whitelist, minus user blacklist.
118
+ # Supports both short names (sensitive_elements) and long names
119
+ # (whitespace_sensitive_elements) for backward compatibility.
120
+ #
121
+ # @param match_opts [Hash] Resolved match options
122
+ # @return [Array<String>] Sensitive element names
123
+ def resolved_sensitive_elements(match_opts)
124
+ sensitive = []
125
+
126
+ # 1. Format defaults
127
+ format = match_opts[:format] || :xml
128
+ case format
129
+ when :html, :html4, :html5
130
+ sensitive += %w[pre code textarea script style]
131
+ end
132
+
133
+ # 2. User whitelist (additive to format defaults)
134
+ whitelist = match_opts[:sensitive_elements]
135
+ whitelist ||= match_opts[:whitespace_sensitive_elements]
136
+ if whitelist
137
+ sensitive += whitelist.map(&:to_s)
138
+ end
139
+
140
+ # 3. User blacklist removes from combined set
141
+ blacklist_raw = match_opts[:insensitive_elements]
142
+ blacklist_raw ||= match_opts[:whitespace_insensitive_elements]
143
+ if blacklist_raw
144
+ blacklist = blacklist_raw.to_set(&:to_s)
145
+ sensitive.reject! { |e| blacklist.include?(e) }
146
+ end
147
+
148
+ sensitive.uniq
149
+ end
150
+
69
151
  # Get format-specific default sensitive elements
70
152
  #
71
153
  # This is the SINGLE SOURCE OF TRUTH for default whitespace-sensitive
@@ -25,6 +25,9 @@ module Canon
25
25
  preserve_whitespace: preserve_whitespace)
26
26
  end
27
27
 
28
+ # Normalize encoding before preprocessing (UTF-16 strings can't use strip, etc.)
29
+ node = Canon::Xml::DataModel.normalize_encoding(node)
30
+
28
31
  # Apply preprocessing to XML string before parsing
29
32
  xml_string = apply_preprocessing(node, preprocessing).strip
30
33
 
@@ -190,14 +190,17 @@ diff_children, differences)
190
190
  end
191
191
  end
192
192
 
193
- # Filter out whitespace-only text nodes based on structural_whitespace setting
194
- # - :ignore or :normalize: Filter all whitespace-only text nodes
195
- # - :strict: Preserve all whitespace-only text nodes (don't filter any)
196
- if text_node?(node) && %i[ignore
197
- normalize].include?(match_opts[:structural_whitespace])
198
- text = node_text(node)
199
- return true if MatchOptions.normalize_text(text).empty?
200
- end
193
+ # Strip whitespace-only text nodes based on parent element configuration.
194
+ # Use sensitive_elements / insensitive_elements to control.
195
+ # Blacklist (insensitive) > whitelist (sensitive) > format defaults.
196
+ return false unless text_node?(node) && node.parent
197
+ return false unless MatchOptions.normalize_text(node_text(node)).empty?
198
+
199
+ return true unless WhitespaceSensitivity.whitespace_preserved?(
200
+ node.parent, match_opts
201
+ )
202
+
203
+ false
201
204
 
202
205
  false
203
206
  end
@@ -326,9 +326,9 @@ module Canon
326
326
  # Handle cases where one node is missing (e.g. text added or removed)
327
327
  if node1.nil? || node2.nil?
328
328
  if node1.nil?
329
- text2 = node2.to_s
329
+ text2 = NodeUtils.get_node_text(node2)
330
330
  else
331
- text1 = node1.to_s
331
+ text1 = NodeUtils.get_node_text(node1)
332
332
  end
333
333
  end
334
334
 
data/lib/canon/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Canon
4
- VERSION = "0.1.16"
4
+ VERSION = "0.1.18"
5
5
  end
@@ -21,8 +21,11 @@ module Canon
21
21
  # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
22
22
  # @return [Nodes::RootNode] Root of the data model tree
23
23
  def self.from_xml(xml_string, preserve_whitespace: false)
24
+ # Normalize encoding before parsing
25
+ normalized_xml = normalize_encoding(xml_string)
26
+
24
27
  # Parse with Nokogiri
25
- doc = Nokogiri::XML(xml_string, &:nonet)
28
+ doc = Nokogiri::XML(normalized_xml, &:nonet)
26
29
 
27
30
  # Check for relative namespace URIs (prohibited by C14N 1.1)
28
31
  check_for_relative_namespace_uris(doc)
@@ -31,6 +34,132 @@ module Canon
31
34
  build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
32
35
  end
33
36
 
37
# Normalize an XML string so that its bytes, its Ruby encoding label and
# its XML declaration all agree on UTF-8 before parsing.
#
# Two situations are handled:
# 1. The XML declaration names a non-UTF-8 encoding. If the bytes are
#    nevertheless valid UTF-8 the declaration is simply wrong; otherwise
#    the bytes are transcoded from the declared encoding. In both cases
#    the declaration is rewritten to UTF-8 so that it matches the bytes
#    that are returned.
# 2. There is no encoding declaration but the string is labelled with a
#    non-UTF-8 encoding. The bytes are re-interpreted as UTF-8 when
#    valid, otherwise transcoded from the labelled encoding.
#
# Non-String input and input that already declares UTF-8 are returned
# untouched.
#
# @param xml_string [String] XML string to normalize
# @return [String] Normalized XML string with UTF-8 encoding
def self.normalize_encoding(xml_string)
  return xml_string unless xml_string.is_a?(String)

  declared_encoding = extract_xml_encoding(xml_string)

  if declared_encoding
    # Declaration already says UTF-8: nothing to reconcile.
    return xml_string if declared_encoding.upcase == "UTF-8"

    # Only trust the re-interpretation when the bytes really are valid
    # UTF-8, and rewrite the declaration on the UTF-8 labelled copy so the
    # result carries the right encoding label as well.
    utf8_reinterpreted = try_utf8_reinterpretation(xml_string)
    if utf8_reinterpreted&.valid_encoding?
      return update_xml_declaration(utf8_reinterpreted, "UTF-8")
    end

    # Bytes are not UTF-8, so they must really be in the declared encoding.
    transcoded = transcode_to_utf8(xml_string, declared_encoding)

    # After a successful transcode the old declaration no longer describes
    # the bytes; leaving it in place would make the parser decode the
    # UTF-8 bytes with the stale charset.
    if transcoded.encoding == Encoding::UTF_8 && transcoded.valid_encoding?
      return update_xml_declaration(transcoded, "UTF-8")
    end

    # Transcoding was not possible: hand back whatever the helper returned
    # without claiming an encoding we could not establish.
    return transcoded
  elsif xml_string.encoding != Encoding::UTF_8
    # No declaration, non-UTF-8 label: prefer re-interpreting the bytes,
    # fall back to transcoding from the labelled encoding.
    return try_utf8_reinterpretation(xml_string) ||
           transcode_to_utf8(xml_string, xml_string.encoding.name)
  end

  xml_string
end
79
+
80
# Rewrite the encoding named in the leading XML declaration.
#
# Only an `encoding=...` pseudo-attribute inside a declaration at the very
# start of the string (optionally preceded by whitespace) is touched. A
# string without such a declaration is returned unchanged, so an
# `encoding="..."` attribute or text further down in the document body is
# never rewritten by accident.
#
# The replacement is always written as `encoding="<new_encoding>"` with
# double quotes, whatever quoting and spacing the input used.
#
# @param xml_string [String] XML string
# @param new_encoding [String] New encoding to declare
# @return [String] XML string with updated declaration (a new String; the
#   argument is not modified)
def self.update_xml_declaration(xml_string, new_encoding)
  # Group 1 keeps everything from the start up to the encoding attribute;
  # the lazy quantifier picks the first `encoding=` in the declaration.
  declared_encoding = /\A(\s*<\?xml[^>]*?)\bencoding\s*=\s*["'][^"']+["']/i

  # Block form avoids backslash interpretation in the replacement text.
  xml_string.sub(declared_encoding) do
    %(#{Regexp.last_match(1)}encoding="#{new_encoding}")
  end
end
90
+
91
# Produce a UTF-8 version of a string whose bytes are claimed to be in
# +source_encoding+.
#
# Decision order:
# * +source_encoding+ is "UTF-8": the bytes are only relabelled.
# * the bytes are not well-formed in +source_encoding+: the claim cannot
#   be trusted, so the bytes are only relabelled as UTF-8.
# * the bytes are also valid UTF-8: the claim is most likely wrong, so the
#   bytes are only relabelled as UTF-8.
# * otherwise the bytes are converted from +source_encoding+ to UTF-8,
#   substituting "?" for anything that cannot be converted.
#
# The argument itself is never modified. When the encoding name is unknown
# to Ruby or no converter exists, the original string is returned as-is
# instead of raising.
#
# @param xml_string [String] String to transcode
# @param source_encoding [String] Source encoding to interpret bytes as
# @return [String] UTF-8 transcoded string, or +xml_string+ itself when the
#   source encoding cannot be used
def self.transcode_to_utf8(xml_string, source_encoding)
  relabelled = xml_string.dup.force_encoding(Encoding::UTF_8)
  return relabelled if source_encoding == "UTF-8"

  # Raises ArgumentError for an encoding name Ruby does not know.
  as_declared = xml_string.dup.force_encoding(source_encoding)

  return relabelled unless as_declared.valid_encoding?
  return relabelled if relabelled.valid_encoding?

  as_declared.encode("UTF-8", source_encoding,
                     invalid: :replace,
                     undef: :replace,
                     replace: "?")
rescue EncodingError, ArgumentError
  # EncodingError: no converter for this encoding pair.
  # ArgumentError: unknown encoding name taken from the XML declaration.
  xml_string
end
125
+
126
# Re-interpret a string as UTF-8 when, and only when, its bytes form valid
# UTF-8.
#
# This covers strings that carry the wrong encoding label (for example
# after `force_encoding("Shift_JIS")` or a binary read) although the
# underlying bytes are UTF-8. No bytes are changed; only the label differs
# in the returned copy.
#
# A string that is already labelled UTF-8 is returned as the same object,
# but only if its bytes are actually valid; invalid UTF-8 yields nil just
# like any other input that cannot be read as UTF-8.
#
# @param xml_string [String] XML string to check
# @return [String, nil] UTF-8 re-interpreted string, or nil if not possible
def self.try_utf8_reinterpretation(xml_string)
  candidate =
    if xml_string.encoding == Encoding::UTF_8
      xml_string
    else
      # Work on a copy so the caller's string keeps its original label.
      xml_string.dup.force_encoding(Encoding::UTF_8)
    end

  candidate if candidate.valid_encoding?
end
143
+
144
# Read the encoding named in a leading XML declaration, if there is one.
#
# Both quoting styles are recognised:
#   <?xml version="1.0" encoding="UTF-8"?>
#   <?xml version='1.0' encoding='UTF-8'?>
#
# Matching is done against a binary-labelled copy of the input so that the
# pattern can be applied regardless of the string's own encoding label
# (e.g. UTF-16) or of invalid byte sequences in it.
#
# @param xml_string [String] XML string
# @return [String, nil] Declared encoding or nil if not found
def self.extract_xml_encoding(xml_string)
  declaration = /\A\s*<\?xml[^>]*\bencoding\s*=\s*["']([^"']+)["'][^>]*\?>/i
  raw_bytes = xml_string.dup.force_encoding("BINARY")

  found = declaration.match(raw_bytes)
  found ? found[1] : nil
end
162
+
34
163
  # Alias for compatibility with base class interface
35
164
  def self.parse(xml_string)
36
165
  from_xml(xml_string)
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "benchmark/ips"
4
+ require "table_tennis"
4
5
 
5
6
  # Ensure lib/ is on the load path regardless of tmp location
6
7
  lib_path = File.expand_path(File.join(__dir__, "..", "..", "lib"))
@@ -101,7 +102,8 @@ class BenchmarkRunner
101
102
  end
102
103
 
103
104
  # Category section with description
104
- def self.category(title, icon:, description:, failure_means:, compare_against: nil)
105
+ def self.category(title, icon:, description:, failure_means:,
106
+ compare_against: nil)
105
107
  puts
106
108
  puts "#{CYAN}#{VL}#{CLEAR} #{BOLD}#{MAGENTA}#{icon} #{title}#{CLEAR}"
107
109
  puts
@@ -124,26 +126,21 @@ class BenchmarkRunner
124
126
  puts
125
127
  end
126
128
 
127
- # Results table for a category
128
- def self.table_header
129
- puts " #{BOLD}#{'%-35s'} #{'%10s'} #{'%8s'} #{'%s'}#{CLEAR}"
130
- puts " Test IPS ±% Speedup"
131
- sep(char: "─", width: 76)
132
- end
129
+ # Results table for a category using TableTennis
130
+ def self.table(results)
131
+ rows = results.map do |r|
132
+ {
133
+ test: r[:name],
134
+ ips: r[:ips],
135
+ deviation: "#{r[:deviation].round(1)}%",
136
+ status: r[:is_best] ? "BEST" : "",
137
+ }
138
+ end
133
139
 
134
- def self.table_row(label, ips, deviation, speedup: nil, is_best: false)
135
- speedup_str = speedup ? " ⚡#{speedup.round(2)}x" : ""
136
- label_str = is_best ? "#{GREEN}#{label}#{CLEAR}" : label
137
- bar = render_bar(ips)
140
+ return if rows.empty?
138
141
 
139
- puts " #{label_str}"
140
- puts " #{DIM}#{bar}#{CLEAR} #{format('%10.1f', ips)} #{format('%6.1f%%', deviation)}#{speedup_str}"
141
- puts
142
- end
143
-
144
- def self.table_footer
145
- sep(char: "─", width: 76)
146
- puts
142
+ table = TableTennis.new(rows, theme: :dark)
143
+ table.render
147
144
  end
148
145
 
149
146
  def self.speedup_badge(factor, label)
@@ -151,24 +148,7 @@ class BenchmarkRunner
151
148
  puts " #{GREEN} #{factor.round(2)}x faster#{CLEAR}"
152
149
  end
153
150
 
154
- def self.reset_max_ips
155
- @max_ips = nil
156
- end
157
-
158
- def self.set_max_ips(val)
159
- @max_ips = val
160
- end
161
-
162
- def self.render_bar(ips, max_width: 20)
163
- @max_ips ||= ips
164
- ratio = ips / @max_ips.to_f
165
- width = [(ratio * max_width).round, 1].max
166
- filled = [width, max_width].min
167
- empty = max_width - filled
168
- ("█" * filled) + ("░" * empty)
169
- end
170
-
171
- # Summary card
151
+ # Summary card using TableTennis
172
152
  def self.summary_card(results)
173
153
  puts
174
154
  sep(width: 78)
@@ -176,16 +156,23 @@ class BenchmarkRunner
176
156
  puts " #{BOLD}#{MAGENTA}SUMMARY#{CLEAR}"
177
157
  puts
178
158
 
179
- total = results.length
180
-
181
- results.each do |r|
182
- # For standalone runs, all results are shown as "current" without comparison
183
- ips_str = r[:ips] ? format("%10.1f IPS", r[:ips]) : ""
184
- puts " #{DIM}◆#{CLEAR} #{format('%-35s', r[:label])} #{ips_str}"
159
+ rows = results.map do |r|
160
+ {
161
+ benchmark: r[:label],
162
+ ips: r[:ips]&.round(1),
163
+ }
185
164
  end
186
165
 
166
+ return if rows.empty?
167
+
168
+ table = TableTennis.new(rows,
169
+ title: "Performance Results",
170
+ theme: :dark,
171
+ headers: { benchmark: "Benchmark", ips: "IPS" })
172
+ table.render
173
+
187
174
  puts
188
- puts " #{DIM}#{total} benchmarks completed#{CLEAR}"
175
+ puts " #{DIM}#{results.length} benchmarks completed#{CLEAR}"
189
176
  puts
190
177
  end
191
178
  end
@@ -239,24 +226,35 @@ class BenchmarkRunner
239
226
  # Test definitions
240
227
  BENCHMARKS = {
241
228
  xml_parsing: [
242
- { name: "DOM (simple)", method: :xml_parse_dom_simple, desc: "Standard DOM parsing" },
243
- { name: "SAX (simple)", method: :xml_parse_sax_simple, desc: "Streaming SAX parsing" },
244
- { name: "DOM (large)", method: :xml_parse_dom_large, desc: "Large document DOM" },
245
- { name: "SAX (large)", method: :xml_parse_sax_large, desc: "Large document SAX" },
229
+ { name: "DOM (simple)", method: :xml_parse_dom_simple,
230
+ desc: "Standard DOM parsing" },
231
+ { name: "SAX (simple)", method: :xml_parse_sax_simple,
232
+ desc: "Streaming SAX parsing" },
233
+ { name: "DOM (large)", method: :xml_parse_dom_large,
234
+ desc: "Large document DOM" },
235
+ { name: "SAX (large)", method: :xml_parse_sax_large,
236
+ desc: "Large document SAX" },
246
237
  ],
247
238
  html_parsing: [
248
239
  { name: "Simple HTML", method: :html_parse_simple, desc: "Basic HTML" },
249
- { name: "Complex HTML", method: :html_parse_complex, desc: "HTML with scripts/tables" },
240
+ { name: "Complex HTML", method: :html_parse_complex,
241
+ desc: "HTML with scripts/tables" },
250
242
  ],
251
243
  xml_comparison: [
252
- { name: "Identical XML", method: :xml_compare_identical, desc: "Same documents" },
253
- { name: "Similar XML", method: :xml_compare_similar, desc: "Slightly different" },
254
- { name: "Different XML", method: :xml_compare_different, desc: "Different namespaces" },
244
+ { name: "Identical XML", method: :xml_compare_identical,
245
+ desc: "Same documents" },
246
+ { name: "Similar XML", method: :xml_compare_similar,
247
+ desc: "Slightly different" },
248
+ { name: "Different XML", method: :xml_compare_different,
249
+ desc: "Different namespaces" },
255
250
  ],
256
251
  html_comparison: [
257
- { name: "Identical HTML", method: :html_compare_identical, desc: "Same HTML" },
258
- { name: "Similar HTML", method: :html_compare_similar, desc: "Slightly different" },
259
- { name: "Different HTML", method: :html_compare_different, desc: "Different structure" },
252
+ { name: "Identical HTML", method: :html_compare_identical,
253
+ desc: "Same HTML" },
254
+ { name: "Similar HTML", method: :html_compare_similar,
255
+ desc: "Slightly different" },
256
+ { name: "Different HTML", method: :html_compare_different,
257
+ desc: "Different structure" },
260
258
  ],
261
259
  formatting: [
262
260
  { name: "XML C14N", method: :xml_c14n_format, desc: "Canonical XML" },
@@ -287,7 +285,8 @@ class BenchmarkRunner
287
285
  end.join
288
286
  "<#{prefix}root#{ns_attr}#{attrs}>#{children}</#{prefix}root>"
289
287
  else
290
- child = build_xml_element(items / 2, depth - 1, prefix, with_attrs, "")
288
+ child = build_xml_element(items / 2, depth - 1, prefix, with_attrs,
289
+ "")
291
290
  "<#{prefix}root#{ns_attr}#{attrs}>#{child}</#{prefix}root>"
292
291
  end
293
292
  end
@@ -401,8 +400,6 @@ class BenchmarkRunner
401
400
  end
402
401
 
403
402
  def run_benchmarks
404
- Term.reset_max_ips
405
-
406
403
  # Header
407
404
  Term.header("Canon Performance Benchmarks", color: Term::CYAN)
408
405
 
@@ -434,8 +431,6 @@ class BenchmarkRunner
434
431
  compare_against: config[:compare_against],
435
432
  )
436
433
 
437
- Term.table_header
438
-
439
434
  # Run each test in category
440
435
  category_results = []
441
436
  max_ips = 0
@@ -457,23 +452,32 @@ class BenchmarkRunner
457
452
  $stdout = original_stdout
458
453
  end
459
454
 
460
- # Reset for relative bars within category
461
- Term.set_max_ips(max_ips)
462
-
463
- # Print results with relative bars
464
- category_results.each do |r|
455
+ # Build results for TableTennis table
456
+ table_rows = category_results.map do |r|
465
457
  is_best = r[:result][:upper] >= max_ips
466
- Term.table_row(r[:name], (r[:result][:lower] + r[:result][:upper]) / 2.0,
467
- calculate_deviation(r[:result]), is_best: is_best)
468
- @all_results << { label: "#{config[:name]}: #{r[:name]}", ips: (r[:result][:lower] + r[:result][:upper]) / 2.0 }
458
+ label = "#{config[:name]}: #{r[:name]}"
459
+ @all_results << { label: label,
460
+ ips: (r[:result][:lower] + r[:result][:upper]) / 2.0 }
461
+ @results[label] = r[:result] # Populate @results for comparison
462
+ {
463
+ name: r[:name],
464
+ ips: (r[:result][:lower] + r[:result][:upper]) / 2.0,
465
+ deviation: calculate_deviation(r[:result]),
466
+ is_best: is_best,
467
+ }
469
468
  end
470
469
 
471
- Term.table_footer
470
+ # Render TableTennis table
471
+ Term.table(table_rows)
472
472
 
473
473
  # SAX vs DOM comparison for XML parsing
474
474
  if category == :xml_parsing && SAX_AVAILABLE
475
- sax = category_results.find { |r| r[:name].include?("SAX") && r[:name].include?("large") }
476
- dom = category_results.find { |r| r[:name].include?("DOM") && r[:name].include?("large") }
475
+ sax = category_results.find do |r|
476
+ r[:name].include?("SAX") && r[:name].include?("large")
477
+ end
478
+ dom = category_results.find do |r|
479
+ r[:name].include?("DOM") && r[:name].include?("large")
480
+ end
477
481
 
478
482
  if sax && dom
479
483
  sax_ips = (sax[:result][:lower] + sax[:result][:upper]) / 2.0
@@ -481,9 +485,11 @@ class BenchmarkRunner
481
485
  speedup = sax_ips / dom_ips
482
486
 
483
487
  if speedup > 1.0
484
- Term.speedup_badge(speedup, "SAX is faster than DOM for large documents")
488
+ Term.speedup_badge(speedup,
489
+ "SAX is faster than DOM for large documents")
485
490
  else
486
- Term.hint("DOM is #{format('%.2f', 1 / speedup)}x faster than SAX for large documents")
491
+ Term.hint("DOM is #{format('%.2f',
492
+ 1 / speedup)}x faster than SAX for large documents")
487
493
  end
488
494
  end
489
495
  end
@@ -509,7 +515,8 @@ class BenchmarkRunner
509
515
  html = DataGenerator.generate_html(items: @items)
510
516
  measure { Canon.parse_html(html) }
511
517
  when :html_parse_complex
512
- html = DataGenerator.generate_html(items: @items, with_scripts: true, with_tables: true)
518
+ html = DataGenerator.generate_html(items: @items, with_scripts: true,
519
+ with_tables: true)
513
520
  measure { Canon.parse_html(html) }
514
521
  when :xml_compare_identical
515
522
  xml = DataGenerator.generate_xml(items: @items)
@@ -566,7 +573,8 @@ class BenchmarkRunner
566
573
  error_margin = std_dev / mean
567
574
  error_pct = error_margin.round(4)
568
575
 
569
- { lower: mean.round(4) * (1 - error_pct), upper: mean.round(4) * (1 + error_pct) }
576
+ { lower: mean.round(4) * (1 - error_pct),
577
+ upper: mean.round(4) * (1 + error_pct) }
570
578
  end
571
579
 
572
580
  def measure_time
@@ -4,6 +4,7 @@ require "json"
4
4
  require "open3"
5
5
  require "tmpdir"
6
6
  require "fileutils"
7
+ require "table_tennis"
7
8
 
8
9
  module PerformanceHelpers
9
10
  # ANSI color codes for terminal output
@@ -97,9 +98,49 @@ module PerformanceHelpers
97
98
  all_base.merge!(base_results)
98
99
  all_current.merge!(curr_results)
99
100
 
101
+ # Collect comparison results for TableTennis table
102
+ comparison_rows = []
103
+
100
104
  curr_results.each do |label, result|
101
- print_realtime_comparison(label, result, base_results[label], threshold)
105
+ base_result = base_results[label]
106
+ cmp = compare_metrics(label, result, base_result, threshold)
107
+ comparison_rows << cmp
102
108
  end
109
+
110
+ print_comparison_table(comparison_rows, threshold)
111
+ end
112
+
113
# Print all base-vs-current comparisons of one run as a single
# TableTennis table, followed by a blank line. Prints nothing when there
# are no rows.
#
# Each entry is read through the keys :label, :base_ips, :curr_ips and
# :change (presumably the hashes built by compare_metrics — verify against
# that method). :change is treated as a fraction and shown as a percentage.
#
# Status column:
# * "NEW"       - no base value exists for the benchmark
# * "N/A"       - a base value exists but no change could be computed
# * "REGRESSED" - the change is below -threshold
# * "OK"        - everything else
#
# @param comparison_rows [Array<Hash>] comparison entries, one per benchmark
# @param threshold [Numeric] allowed fractional drop before a benchmark is
#   reported as regressed
# @return [nil]
def print_comparison_table(comparison_rows, threshold)
  rows = comparison_rows.map do |cmp|
    change = cmp[:change]

    status =
      if cmp[:base_ips].nil?
        "NEW"
      elsif change.nil?
        # The change column already tolerates a missing value; do the same
        # here instead of calling `<` on nil.
        "N/A"
      elsif change < -threshold
        "REGRESSED"
      else
        "OK"
      end

    {
      benchmark: cmp[:label],
      base_ips: cmp[:base_ips]&.round(1),
      curr_ips: cmp[:curr_ips]&.round(1),
      change: change ? "#{(change * 100).round(1)}%" : "N/A",
      status: status,
    }
  end

  return if rows.empty?

  table = TableTennis.new(rows,
                          title: "Performance Comparison",
                          theme: :dark,
                          headers: {
                            benchmark: "Benchmark",
                            base_ips: "Base IPS",
                            curr_ips: "Curr IPS",
                            change: "Change",
                            status: "Status",
                          })
  table.render
  puts
end
104
145
 
105
146
  def compare_metrics(label, curr, base, threshold)
@@ -197,7 +238,9 @@ module PerformanceHelpers
197
238
  # Handle new benchmarks that don't exist in base
198
239
  if base_metrics.nil?
199
240
  curr_ips = (curr_metrics[:lower] + curr_metrics[:upper]) / 2.0
200
- puts "#{format('%-30s', label)}: #{GREEN}NEW#{CLEAR} (current: #{format('%.2f', curr_ips)} IPS) [N/A]\n\n"
241
+ puts "#{format('%-30s',
242
+ label)}: #{GREEN}NEW#{CLEAR} (current: #{format('%.2f',
243
+ curr_ips)} IPS) [N/A]\n\n"
201
244
  return
202
245
  end
203
246
 
@@ -212,7 +255,8 @@ module PerformanceHelpers
212
255
  base_str = format("%.2f", base_ips)
213
256
  curr_str = format("%.2f", curr_ips)
214
257
 
215
- puts "#{format('%-30s', label)}: #{GRAY}#{base_str}#{CLEAR} → #{color}#{curr_str}#{CLEAR} IPS " \
258
+ puts "#{format('%-30s',
259
+ label)}: #{GRAY}#{base_str}#{CLEAR} → #{color}#{curr_str}#{CLEAR} IPS " \
216
260
  "(change: #{color}#{delta_str}#{CLEAR}) [#{color}#{status}#{CLEAR}]\n\n"
217
261
  end
218
262
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: canon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.16
4
+ version: 0.1.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-21 00:00:00.000000000 Z
11
+ date: 2026-03-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: diff-lcs