multi_xml 0.8.1 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/benchmark.rb ADDED
@@ -0,0 +1,1002 @@
1
+ lib_dir = File.expand_path("lib", __dir__)
2
+ $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
3
+
4
+ require "optparse"
5
+ require "multi_xml"
6
+
7
# Benchmark harness for comparing MultiXML parsers across representative XML workloads.
#
# This opening of the class declares the value objects and tuning constants
# shared by the nested helper classes, plus the `run` entry point that chains
# option parsing, parser loading, measurement, and reporting.
class MultiXMLBenchmark
  # A benchmarkable parser: its symbolic name and the resolved parser reference.
  ParserEntry = Struct.new(:name, :module_ref, keyword_init: true)
  # One XML document together with its labels, byte size, and parse options.
  PayloadCase = Struct.new(:shape, :bucket, :label, :xml, :bytes, :options, keyword_init: true)
  # The measured result of one parser on one payload.
  Measurement = Struct.new(:payload, :parser, :ips, :allocations_per_parse, keyword_init: true)
  # Everything a run produces: benchmarked parsers, measurements, exclusions.
  RunResults = Struct.new(:parsers, :measurements, :excluded_parsers, keyword_init: true)

  PARSER_NAMES = %i[ox libxml libxml_sax nokogiri nokogiri_sax rexml oga].freeze
  BYTES_PER_MEGABYTE = 1024.0 * 1024.0
  CLOCK = Process::CLOCK_MONOTONIC
  EPSILON = 1e-12
  MAX_ITERATIONS = 1_000_000
  DEFAULTS = {
    warmup: 0.03,
    time: 0.15,
    samples: 5,
    format: :plain,
    validate: true,
    verify_preference: false
  }.freeze

  # Runs the whole benchmark.
  #
  # @param argv [Array<String>] command-line arguments handed to CLI#parse
  # @return [Integer] 0, or the preference-verification status (0 or 1) when
  #   the :verify_preference option is set
  # @raise [RuntimeError] when the parser loader returns no parsers
  def self.run(argv = ARGV)
    settings = CLI.new.parse(argv)
    available = ParserLoader.new.load(settings[:parsers])
    raise "No supported parsers are available for benchmarking" if available.empty?

    catalog = PayloadCatalog.new.build
    outcome = Runner.new(parsers: available, payloads: catalog, options: settings).run
    Reporter.new(results: outcome, options: settings).print
    return 0 unless settings[:verify_preference]

    verify_preference(outcome)
  end

  # Prints the preference report and maps its verdict to a status code.
  #
  # @return [Integer] 0 when the verifier reports valid, 1 otherwise
  def self.verify_preference(results)
    checker = PreferenceVerifier.new(results)
    checker.report
    checker.valid? ? 0 : 1
  end
  private_class_method :verify_preference
end
48
+
49
class MultiXMLBenchmark
  # Command-line option parsing for the benchmark script.
  class CLI
    # Overrides applied by --quick: much shorter warmup/measurement and one sample.
    QUICK_OPTIONS = {
      warmup: 0.005,
      time: 0.02,
      samples: 1
    }.freeze
    private_constant :QUICK_OPTIONS

    # Parses the given arguments on top of MultiXMLBenchmark::DEFAULTS.
    #
    # @param argv [Array<String>] arguments; recognized options are removed
    #   from it (OptionParser#parse! is destructive)
    # @return [Hash] the effective options
    # @raise [OptionParser::InvalidArgument] when a timing or sample value is
    #   out of range
    def parse(argv)
      options = MultiXMLBenchmark::DEFAULTS.dup
      parser(options).parse!(argv)
      validate!(options)
      options
    end

    private

    # Builds the OptionParser whose handlers write into +options+.
    def parser(options)
      OptionParser.new do |opts|
        opts.banner = "Usage: bundle exec ruby benchmark.rb [options]"
        add_timing_options(opts, options)
        add_format_options(opts, options)
        add_validation_options(opts, options)
        add_quick_option(opts, options)
      end
    end

    def add_timing_options(parser, options)
      parser.on("--warmup SECONDS", Float, "Warmup time budget per benchmark (default: #{MultiXMLBenchmark::DEFAULTS[:warmup]})") do |value|
        options[:warmup] = value
      end
      parser.on("--time SECONDS", Float, "Measurement time budget per sample (default: #{MultiXMLBenchmark::DEFAULTS[:time]})") do |value|
        options[:time] = value
      end
      parser.on("--samples COUNT", Integer, "Samples per benchmark (default: #{MultiXMLBenchmark::DEFAULTS[:samples]})") do |value|
        options[:samples] = value
      end
    end

    def add_format_options(parser, options)
      parser.on("--parsers x,y,z", Array, "Restrict to specific parsers") do |value|
        options[:parsers] = parser_names(value)
      end
      parser.on("--format FORMAT", %w[plain markdown], "Output format: plain or markdown") do |value|
        options[:format] = value.to_sym
      end
    end

    # Converts the raw --parsers list into parser name symbols.
    #
    # OptionParser's Array acceptor yields nil for empty list items (as in
    # "ox,,rexml"), and items may carry surrounding whitespace, so nil and
    # blank items are dropped instead of crashing or producing an empty symbol.
    def parser_names(values)
      values.filter_map do |raw|
        name = raw.to_s.strip
        name.to_sym unless name.empty?
      end
    end

    def add_validation_options(parser, options)
      parser.on("--no-validate", "Skip cross-parser result validation") do
        options[:validate] = false
      end
      parser.on("--verify-preference", "Assert MultiXML::PARSER_PREFERENCE matches benchmark ranking") do
        options[:verify_preference] = true
      end
    end

    # --quick overwrites warmup/time/samples at the point it appears, so timing
    # flags given after it on the command line still take effect.
    def add_quick_option(parser, options)
      parser.on("--quick", "Smoke-test mode with shorter timings") do
        options.merge!(QUICK_OPTIONS)
      end
    end

    def validate!(options)
      validate_warmup!(options[:warmup])
      validate_time!(options[:time])
      validate_samples!(options[:samples])
    end

    def validate_warmup!(value)
      raise OptionParser::InvalidArgument, "--warmup must be >= 0" if value.negative?
    end

    def validate_time!(value)
      raise OptionParser::InvalidArgument, "--time must be > 0" unless value.positive?
    end

    def validate_samples!(value)
      raise OptionParser::InvalidArgument, "--samples must be > 0" unless value.positive?
    end
  end
end
133
+
134
class MultiXMLBenchmark
  # Resolves benchmarkable parsers from the current Ruby environment.
  class ParserLoader
    # Loads an entry for every requested parser that can be resolved.
    #
    # Repeated names are collapsed to their first occurrence so a parser is
    # never benchmarked (and reported) twice.
    #
    # @param selected [Array<Symbol>, nil] parser names to load; nil means
    #   MultiXMLBenchmark::PARSER_NAMES
    # @return [Array<MultiXMLBenchmark::ParserEntry>] entries for the parsers
    #   that resolved, in request order
    def load(selected = nil)
      parser_names(selected).uniq.filter_map { |name| load_entry(name) }
    end

    private

    def parser_names(selected)
      selected || MultiXMLBenchmark::PARSER_NAMES
    end

    # Resolves one parser; warns and returns nil when MultiXML raises
    # ParserLoadError for it.
    def load_entry(name)
      module_ref = MultiXML.send(:resolve_parser, name)
      MultiXMLBenchmark::ParserEntry.new(name: name, module_ref: module_ref)
    rescue MultiXML::ParserLoadError
      warn "Skipping parser #{name.inspect}: not available"
      nil
    end
  end
end
156
+
157
class MultiXMLBenchmark
  # Builds the benchmark payload matrix across XML shapes and sizes.
  #
  # Each CASES row names a payload shape, a size bucket, and the count handed
  # to the matching factory method. `build` turns every row into a
  # MultiXMLBenchmark::PayloadCase whose XML and options are frozen.
  class PayloadCatalog
    CASES = [
      [:shallow_fields, :small, 40],
      [:shallow_fields, :medium, 450],
      [:deep_tree, :medium, 180],
      [:record_batch, :small, 120],
      [:record_batch, :medium, 320],
      [:attribute_dense, :medium, 520],
      [:mixed_content, :medium, 180],
      [:namespace_feed, :medium, 260],
      [:catalog, :large, 1_400]
    ].freeze
    private_constant :CASES

    # @return [Array<MultiXMLBenchmark::PayloadCase>] one case per CASES row, in order
    def build
      CASES.map { |shape, bucket, count| payload_case(shape, bucket, payload_xml(shape, count)) }
    end

    private

    # Dispatches a shape to the factory that generates it.
    #
    # @return [Hash] a hash with :xml and :options keys
    # @raise [ArgumentError] for a shape no factory handles, rather than
    #   silently generating a catalog for it
    def payload_xml(shape, count)
      case shape
      when :shallow_fields then ShallowPayloadFactory.new.shallow_fields(count)
      when :deep_tree then ShallowPayloadFactory.new.deep_tree(count)
      when :record_batch then RecordPayloadFactory.new.record_batch(count)
      when :attribute_dense then RecordPayloadFactory.new.attribute_dense(count)
      when :mixed_content then MixedPayloadFactory.new.mixed_content(count)
      when :namespace_feed then MixedPayloadFactory.new.namespace_feed(count)
      when :catalog then CatalogPayloadFactory.new.catalog(count)
      else raise ArgumentError, "Unknown payload shape: #{shape.inspect}"
      end
    end

    # Wraps a factory result in a PayloadCase labelled "<shape>/<bucket>".
    def payload_case(shape, bucket, payload)
      MultiXMLBenchmark::PayloadCase.new(
        shape: shape,
        bucket: bucket,
        label: "#{shape}/#{bucket}",
        bytes: payload.fetch(:xml).bytesize,
        options: payload.fetch(:options).freeze,
        xml: payload.fetch(:xml).freeze
      )
    end

    # Shared helpers for generating XML benchmark payloads.
    class FactoryBase
      private

      def wrap_root(inner)
        "<root>#{inner}</root>"
      end

      def default_options
        {typecast_xml_value: false}
      end

      # Deterministic timestamp text derived from the index (day 1-28, minute 0-59).
      def timestamp(index)
        Kernel.format("2026-04-%<day>02dT12:%<minute>02d:56Z", day: (index % 28) + 1, minute: index % 60)
      end

      # Deterministic filler text of exactly +width+ characters: the
      # prefix/index-based base string repeated as needed, then truncated.
      def token(prefix, index, width)
        base = "#{prefix}_#{index.to_s(36)}_abcdefghijklmnopqrstuvwxyz0123456789"
        repeats = (width.to_f / base.length).ceil
        (base * repeats)[0, width]
      end
    end

    # Builds shallow and deep tree payloads.
    class ShallowPayloadFactory < FactoryBase
      # +count+ uniquely named sibling elements under one root.
      def shallow_fields(count)
        xml = wrap_root(
          Array.new(count) do |index|
            "<field#{index}>#{token("node", index, 24)}</field#{index}>"
          end.join
        )

        {xml: xml, options: default_options}
      end

      # A single chain of +depth+ nested elements, each holding one <value> child.
      def deep_tree(depth)
        body = +""
        depth.times do |index|
          body << %(<level#{index} depth="#{index}"><value>#{token("value", index, 24)}</value>)
        end
        # Close the levels innermost-first.
        depth.times.reverse_each do |index|
          body << %(</level#{index}>)
        end

        {xml: wrap_root(body), options: default_options}
      end
    end

    # Builds record and attribute-heavy payloads.
    class RecordPayloadFactory < FactoryBase
      def record_batch(count)
        xml = wrap_root(Array.new(count) { |index| record(index) }.join)
        {xml: xml, options: default_options}
      end

      def attribute_dense(count)
        xml = wrap_root(Array.new(count) { |index| attributed_node(index) }.join)
        {xml: xml, options: default_options}
      end

      private

      def record(index)
        <<~XML.delete("\n")
          <record id="#{index}">
          #{record_body(index)}
          </record>
        XML
      end

      def record_body(index)
        [
          "<title>#{token("title", index, 40)}</title>",
          "<status>#{(index % 3).zero? ? "active" : "pending"}</status>",
          "<created_at>#{timestamp(index)}</created_at>",
          "<amount>#{Kernel.format("%.2f", ((index * 17) % 10_000) / 10.0)}</amount>",
          "<tags><tag>alpha</tag><tag>beta</tag><tag>#{token("tag", index, 12)}</tag></tags>"
        ].join
      end

      # One element carrying eight attributes (a0..a7) plus a short text body.
      def attributed_node(index)
        attrs = 8.times.map do |slot|
          %(a#{slot}="#{token("attr", index + slot, 14)}")
        end.join(" ")
        %(<node #{attrs}>#{token("node", index, 18)}</node>)
      end
    end

    # Builds mixed-content and namespace-heavy payloads.
    class MixedPayloadFactory < FactoryBase
      # Namespace declarations placed on the feed root element.
      FEED_NAMESPACES = [
        %(xmlns:atom="http://www.w3.org/2005/Atom"),
        %(xmlns:gd="http://schemas.google.com/g/2005"),
        %(xmlns:ex="https://example.test/schema")
      ].freeze
      private_constant :FEED_NAMESPACES

      def mixed_content(count)
        xml = wrap_root(Array.new(count) { |index| section(index) }.join)
        {xml: xml, options: default_options}
      end

      # A namespaced feed with +count+ entries, parsed with namespaces: :preserve.
      #
      # The root tag is assembled by joining the declarations with a single
      # space so they stay separated no matter how the source is indented;
      # attributes that run together would make the document malformed.
      def namespace_feed(count)
        entries = Array.new(count) { |index| feed_entry(index) }.join
        xml = "<atom:feed #{FEED_NAMESPACES.join(" ")}>#{entries}</atom:feed>"

        {xml: xml, options: default_options.merge(namespaces: :preserve)}
      end

      private

      def section(index)
        <<~XML.delete("\n")
          <section id="s#{index}">
          <title>#{token("title", index, 28)}</title>
          <p>This #{token("text", index, 18)} text has <em>inline emphasis #{index}</em> and <strong>strong #{index}</strong>.</p>
          <p>#{token("body", index + 1, 46)} <a href="https://example.test/#{index}">link #{index}</a> #{token("tail", index + 2, 20)}</p>
          </section>
        XML
      end

      def feed_entry(index)
        <<~XML.delete("\n")
          <atom:entry gd:id="item-#{index}" ex:version="#{index}">
          <atom:title>#{token("title", index, 34)}</atom:title>
          <atom:content type="text">#{token("content", index, 58)}</atom:content>
          <gd:rating value="#{(index % 5) + 1}"/>
          <ex:metadata ex:region="us-west-2" ex:trace="trace-#{index}"/>
          </atom:entry>
        XML
      end
    end

    # Builds large catalog-style payloads.
    class CatalogPayloadFactory < FactoryBase
      def catalog(count)
        xml = wrap_root(Array.new(count) { |index| product(index) }.join)
        {xml: xml, options: default_options}
      end

      private

      def product(index)
        <<~XML.delete("\n")
          <product sku="sku-#{index}" region="us" updated="#{timestamp(index)}">
          #{product_body(index)}
          </product>
        XML
      end

      def product_body(index)
        [
          "<name>#{token("name", index, 30)}</name>",
          "<price currency=\"USD\">#{Kernel.format("%.2f", ((index * 13) % 100_000) / 100.0)}</price>",
          "<inventory available=\"#{index % 2}\" warehouse=\"w#{index % 11}\">#{(index * 7) % 400}</inventory>",
          categories(index),
          dimensions(index),
          "<description>#{token("description", index, 120)}</description>"
        ].join
      end

      def categories(index)
        [
          "<categories>",
          "<category>hardware</category>",
          "<category>component</category>",
          "<category>#{token("category", index, 12)}</category>",
          "</categories>"
        ].join
      end

      def dimensions(index)
        "<dimensions><width>#{(index % 100) + 1}</width><height>#{(index % 80) + 1}</height><depth>#{(index % 50) + 1}</depth></dimensions>"
      end
    end
  end
end
377
+
378
class MultiXMLBenchmark
  # Runs the benchmark matrix across parsers and XML payloads.
  class Runner
    # JRuby surfaces parser backend incompatibilities (e.g. Oga's Java
    # backend against newer JRuby) as java.lang.Error subclasses, which
    # are outside Ruby's StandardError hierarchy. Catch the broader Java
    # tree on JRuby so a busted parser is excluded instead of aborting
    # the run. Java::JavaLang::Throwable resolves lazily under JRuby and
    # doesn't respond to defined?, so gate on RUBY_ENGINE.
    RESCUABLE_PARSE_ERRORS = if RUBY_ENGINE == "jruby"
      require "java"
      [StandardError, Java::JavaLang::Throwable].freeze
    else
      [StandardError].freeze
    end
    # Stands in for the expected output when no parser could produce a
    # baseline; validation then only checks that a parser runs without raising.
    NO_BASELINE = Object.new.freeze
    private_constant :RESCUABLE_PARSE_ERRORS, :NO_BASELINE

    # @param parsers [Array] entries responding to #name and #module_ref
    # @param payloads [Array] payload cases (#label, #xml, #options, #bytes)
    # @param options [Hash] benchmark options; :validate is read here and the
    #   whole hash is handed to the Sampler
    def initialize(parsers:, payloads:, options:)
      @parsers = parsers
      @payloads = payloads
      @options = options
      @sampler = MultiXMLBenchmark::Sampler.new(options)
    end

    # Validates the parsers, then measures every eligible parser on every payload.
    #
    # @return [MultiXMLBenchmark::RunResults]
    # @raise [RuntimeError] when validation excludes every parser
    def run
      eligible_parsers, excluded_parsers = validate_parsers(parsers)
      raise "No parsers passed validation" if eligible_parsers.empty?

      measurements = payloads.each_with_index.flat_map do |payload, index|
        run_payload(payload, eligible_parsers, index)
      end

      MultiXMLBenchmark::RunResults.new(
        excluded_parsers: excluded_parsers,
        measurements: measurements,
        parsers: eligible_parsers
      )
    end

    private

    attr_reader :parsers, :payloads, :options, :sampler

    # @return [Array(Array, Hash)] the eligible entries and a hash mapping each
    #   excluded parser name to the reason it was excluded
    def validate_parsers(entries)
      return [entries, {}] unless options[:validate]

      expected = expected_outputs(entries)
      excluded = excluded_parsers(entries, expected)
      [entries.reject { |entry| excluded.key?(entry.name) }, excluded]
    end

    # Parses every payload with a baseline parser to obtain reference output.
    #
    # The baseline is :rexml when present, otherwise the first entry. If that
    # parser raises, the next candidate is tried instead of aborting the run;
    # the failing parser is still excluded later by #validation_failure.
    #
    # @return [Hash, nil] payload label => parsed output, or nil when no
    #   parser could parse every payload
    def expected_outputs(entries)
      baseline_candidates(entries).each do |candidate|
        outputs = baseline_outputs(candidate)
        return outputs if outputs
      end
      nil
    end

    # Entries ordered by baseline preference: :rexml first, then the rest in
    # their given order.
    def baseline_candidates(entries)
      preferred, others = entries.partition { |entry| entry.name == :rexml }
      preferred + others
    end

    # @return [Hash, nil] outputs per payload label, or nil if parsing raised
    def baseline_outputs(candidate)
      payloads.to_h { |payload| [payload.label, parse_with(candidate, payload)] }
    rescue *RESCUABLE_PARSE_ERRORS
      nil
    end

    # Records, for each parser, the first payload on which validation failed.
    def excluded_parsers(entries, expected)
      payloads.each_with_object({}) do |payload, excluded|
        expected_output = expected ? expected.fetch(payload.label) : NO_BASELINE

        entries.each do |entry|
          next if excluded.key?(entry.name)

          reason = validation_failure(entry, payload, expected_output)
          excluded[entry.name] = "#{payload.label}: #{reason}" if reason
        end
      end
    end

    # @return [String, nil] a failure description, or nil when the parser's
    #   output is acceptable
    def validation_failure(entry, payload, expected_output)
      actual = parse_with(entry, payload)
      return nil if NO_BASELINE.equal?(expected_output) || actual == expected_output

      "output mismatch"
    rescue *RESCUABLE_PARSE_ERRORS => e
      error_summary(e)
    end

    def run_payload(payload, eligible_parsers, index)
      puts "Benchmarking parse #{payload.label} (#{MultiXMLBenchmark::Formatter.human_bytes(payload.bytes)})"
      rotated_parsers(eligible_parsers, index).map { |entry| measure(entry, payload) }
    end

    # Starts each payload with a different parser so no parser always runs first.
    def rotated_parsers(entries, index)
      entries.rotate(index % entries.length)
    end

    def measure(entry, payload)
      prime_parser!(entry, payload)
      stats = sampler.sample(entry, payload)
      MultiXMLBenchmark::Measurement.new(
        allocations_per_parse: stats.fetch(:allocations_per_parse),
        ips: stats.fetch(:ips),
        parser: entry.name,
        payload: payload
      )
    end

    # One untimed parse before sampling.
    def prime_parser!(entry, payload)
      parse_with(entry, payload)
    end

    def parse_with(entry, payload)
      MultiXML.with_parser(entry.module_ref) do
        MultiXML.parse(payload.xml, payload.options)
      end
    end

    # Condenses an error to "Class: first message line", capped at 140 characters.
    def error_summary(error)
      first_line = error.message.to_s.lines.first.to_s.strip
      text = first_line.empty? ? error.class.to_s : "#{error.class}: #{first_line}"
      (text.length > 140) ? "#{text[0, 137]}..." : text
    end
  end
end
491
+
492
class MultiXMLBenchmark
  # Measures throughput for a single parser/payload combination.
  class Sampler
    # @param options [Hash] reads :warmup, :time, and :samples
    def initialize(options)
      @options = options
    end

    # Calibrates an iteration count, warms up, then collects the statistics.
    #
    # @return [Hash] :allocations_per_parse (median, or nil when allocation
    #   counts are unavailable) and :ips (median iterations per second)
    def sample(entry, payload)
      job = work_for(entry, payload)
      count = estimate_iterations(job)
      warmup(job, count)
      sample_stats(job, count)
    end

    private

    attr_reader :options

    # The unit of work being timed: one parse under the entry's parser.
    def work_for(entry, payload)
      -> { MultiXML.with_parser(entry.module_ref) { MultiXML.parse(payload.xml, payload.options) } }
    end

    # Scales the measured iteration count by warmup/time, running at least once.
    def warmup(work, iterations)
      scaled = (iterations * options[:warmup] / options[:time]).round
      timed_loop(work, (scaled < 1) ? 1 : scaled)
    end

    def sample_stats(work, iterations)
      # Allocation samples are collected before the throughput samples.
      allocations = allocation_median(work, iterations)
      rates = sample_rates(work, iterations)

      {allocations_per_parse: allocations, ips: MultiXMLBenchmark::Formatter.median(rates)}
    end

    # One iterations-per-second figure per sample, each taken after a GC pass
    # and with GC disabled during the timed loop.
    def sample_rates(work, iterations)
      options[:samples].times.map do
        GC.start
        seconds = with_gc_disabled { timed_loop(work, iterations) }
        iterations.fdiv([seconds, MultiXMLBenchmark::EPSILON].max)
      end
    end

    def allocation_median(work, iterations)
      per_parse = options[:samples].times.filter_map do
        GC.start
        baseline = allocation_count
        with_gc_disabled { timed_loop(work, iterations) }
        allocation_delta(baseline, iterations)
      end
      return nil if per_parse.empty?

      MultiXMLBenchmark::Formatter.median(per_parse)
    end

    # Grows the iteration count tenfold until one timed loop takes at least a
    # millisecond (or the cap is hit), then extrapolates to the :time budget.
    def estimate_iterations(work)
      count = 1
      seconds = with_gc_disabled { timed_loop(work, count) }

      until seconds >= 0.001 || count >= MultiXMLBenchmark::MAX_ITERATIONS
        count *= 10
        seconds = with_gc_disabled { timed_loop(work, count) }
      end

      budgeted = ((options[:time] / [seconds, MultiXMLBenchmark::EPSILON].max) * count).ceil
      budgeted.clamp(1, MultiXMLBenchmark::MAX_ITERATIONS)
    end

    # Runs the work +iterations+ times and returns the elapsed seconds.
    #
    # @raise [RuntimeError] when the last call returned nil
    def timed_loop(work, iterations)
      began = Process.clock_gettime(MultiXMLBenchmark::CLOCK)
      last_result = nil
      iterations.times { last_result = work.call }
      raise "Benchmark produced nil" if last_result.nil?

      Process.clock_gettime(MultiXMLBenchmark::CLOCK) - began
    end

    # @return [Integer, nil] nil when GC.stat cannot supply the counter
    def allocation_count
      GC.stat.fetch(:total_allocated_objects)
    rescue NoMethodError, KeyError
      nil
    end

    def allocation_delta(before, iterations)
      return nil unless before

      allocated = GC.stat.fetch(:total_allocated_objects) - before
      allocated.fdiv(iterations)
    rescue NoMethodError, KeyError
      nil
    end

    # Yields with GC off; turns it back on only if this call turned it off.
    def with_gc_disabled
      was_disabled = GC.disable
      yield
    ensure
      GC.enable unless was_disabled
    end
  end
end
593
+
594
class MultiXMLBenchmark
  # Prints the benchmark summary and detailed result tables.
  class Reporter
    SUMMARY_HEADERS = ["parser", "overall score", "alloc score", "wins"].freeze
    SUMMARY_ALIGNMENTS = %i[left right right right].freeze
    EXCLUSION_HEADERS = %w[parser reason].freeze
    EXCLUSION_ALIGNMENTS = %i[left left].freeze
    private_constant :SUMMARY_HEADERS, :SUMMARY_ALIGNMENTS, :EXCLUSION_HEADERS, :EXCLUSION_ALIGNMENTS

    # @param results [#parsers, #measurements, #excluded_parsers] run results
    # @param options [Hash] reads :samples and :format
    def initialize(results:, options:)
      @results = results
      @options = options
    end

    # Writes the header, summary table, detail table, and (when any parser was
    # excluded) the exclusion table to standard output.
    def print
      print_header
      print_summary
      puts
      print_details
      return if results.excluded_parsers.empty?

      print_exclusions
    end

    private

    attr_reader :results, :options

    def print_header
      parser_list = results.parsers.map(&:name).join(", ")
      [
        "",
        "Ruby: #{RUBY_ENGINE} #{RUBY_VERSION} (#{RUBY_PLATFORM})",
        "Parsers: #{parser_list}",
        "Method: median ops/s across #{options[:samples]} sample(s); overall score is the geometric",
        "mean of per-benchmark throughput normalized to that benchmark's winner.",
        "Allocation score is a secondary geometric-mean score based on allocated objects per parse.",
        ""
      ].each { |line| puts line }
    end

    # The first summary row is announced as the overall winner.
    def print_summary
      summary_rows = MultiXMLBenchmark::Summary.new(results.parsers, results.measurements).rows
      puts "Overall winner: #{summary_rows.first[0]}"
      puts table(SUMMARY_HEADERS, summary_rows, SUMMARY_ALIGNMENTS)
    end

    def print_details
      details = MultiXMLBenchmark::Details.new(results.parsers, results.measurements)
      puts table(details.headers, details.rows, details.alignments)
    end

    def print_exclusions
      exclusion_rows = results.excluded_parsers.map { |name, why| [name.to_s, why] }
      puts
      puts "Excluded parsers"
      puts table(EXCLUSION_HEADERS, exclusion_rows, EXCLUSION_ALIGNMENTS)
    end

    # Renders a table in the configured output format.
    def table(headers, rows, alignments)
      renderer = MultiXMLBenchmark::TableRenderer.new(format: options[:format])
      renderer.render(headers, rows, alignments: alignments)
    end
  end
end
661
+
662
class MultiXMLBenchmark
  # Asserts MultiXML::PARSER_PREFERENCE matches benchmark throughput ranking.
  #
  # Compares only the parsers that both appear in PARSER_PREFERENCE and were
  # benchmarked on this run, so missing native parsers (e.g. ox on JRuby) are
  # tolerated rather than treated as failures. Adjacent parsers whose
  # observed scores fall within TOLERANCE of each other are treated as
  # tied so noisy benchmark runs that flip close pairs (e.g. oga vs
  # nokogiri on TruffleRuby) don't trigger a failure.
  class PreferenceVerifier
    TOLERANCE = 0.10
    private_constant :TOLERANCE

    # @param results [#parsers, #measurements] the run results to check
    def initialize(results)
      @results = results
    end

    # @return [Boolean] true when no adjacent pair violates the preference order
    def valid?
      violations.empty?
    end

    # Prints either the confirmation line or the list of violations.
    def report
      if valid?
        report_match
      else
        report_violations
      end
    end

    private

    attr_reader :results

    # Parser names in preference order (the second element of each pair).
    def preference_order
      @preference_order ||= MultiXML::PARSER_PREFERENCE.map { |_lib, parser| parser }
    end

    # Parser name => overall score, read from the summary's displayed values.
    def scores
      @scores ||= summary.rows.to_h { |row| [row[0].to_sym, row[1].to_f] }
    end

    def summary
      @summary ||= MultiXMLBenchmark::Summary.new(results.parsers, results.measurements)
    end

    def relevant_parsers
      @relevant_parsers ||= preference_order.select { |parser| scores.key?(parser) }
    end

    def violations
      @violations ||= relevant_parsers.each_cons(2).filter_map do |earlier, later|
        violation_for(earlier, later)
      end
    end

    # @return [Hash, nil] details when the later parser beats the earlier one
    #   by more than TOLERANCE, nil otherwise
    def violation_for(earlier, later)
      earlier_score = scores.fetch(earlier)
      later_score = scores.fetch(later)
      return nil if later_score <= earlier_score * (1 + TOLERANCE)

      {earlier: earlier, later: later, earlier_score: earlier_score, later_score: later_score}
    end

    def report_match
      puts
      puts "PARSER_PREFERENCE matches benchmark ranking within #{tolerance_pct}% tolerance: #{relevant_parsers.join(", ")}"
    end

    def report_violations
      puts
      puts "PARSER_PREFERENCE does not match benchmark ranking (>#{tolerance_pct}% tolerance):"
      violations.each { |violation| puts " #{format_violation(violation)}" }
    end

    def format_violation(violation)
      later_score = violation.fetch(:later_score)
      earlier_score = violation.fetch(:earlier_score)
      "#{violation.fetch(:later)} (#{format_score(later_score)}) outranks " \
        "#{violation.fetch(:earlier)} (#{format_score(earlier_score)}) " \
        "by #{excess_text(earlier_score, later_score)} but is preferenced after it"
    end

    # How far the later score exceeds the earlier one, as display text.
    #
    # Scores are read back from three-decimal strings, so a very slow parser
    # can have a score of 0.0; dividing by it yields Infinity, and rounding
    # Infinity raises FloatDomainError. That case is described in words.
    def excess_text(earlier_score, later_score)
      return "an unbounded margin" unless earlier_score.positive?

      "#{(((later_score / earlier_score) - 1) * 100).round}%"
    end

    def format_score(value)
      Kernel.format("%.3f", value)
    end

    def tolerance_pct
      (TOLERANCE * 100).to_i
    end
  end
end
754
+
755
class MultiXMLBenchmark
  # Computes overall parser scores and benchmark wins.
  class Summary
    # @param parsers [Array<#name>] the benchmarked parsers
    # @param measurements [Array] measurements responding to #payload,
    #   #parser, #ips, and #allocations_per_parse
    def initialize(parsers, measurements)
      @parsers = parsers
      @measurements = measurements
    end

    # One row per parser: name, overall score, allocation score (or "n/a"),
    # and win count, all as strings, best parser first.
    #
    # Rows are ordered by the unrounded scores. Ordering by the displayed
    # three-decimal text would let two parsers whose scores round alike be
    # ranked by allocation score instead of throughput.
    #
    # @return [Array<Array<String>>]
    def rows
      parsers
        .map { |parser| stats_for(parser.name) }
        .sort_by { |stats| sort_key(stats) }
        .map { |stats| summary_row(stats) }
    end

    private

    attr_reader :parsers, :measurements

    # Raw (unformatted) figures for one parser.
    def stats_for(parser_name)
      {
        name: parser_name,
        score: score_for(parser_name),
        allocation_score: allocation_score_for(parser_name),
        wins: wins.fetch(parser_name, 0)
      }
    end

    # Descending by score, then allocation score, then wins. A missing
    # allocation score counts as -1.0 so it sorts after any real score.
    def sort_key(stats)
      [-stats.fetch(:score), -(stats.fetch(:allocation_score) || -1.0), -stats.fetch(:wins)]
    end

    def summary_row(stats)
      allocation_score = stats.fetch(:allocation_score)
      [
        stats.fetch(:name).to_s,
        Kernel.format("%.3f", stats.fetch(:score)),
        allocation_score.nil? ? "n/a" : Kernel.format("%.3f", allocation_score),
        stats.fetch(:wins).to_s
      ]
    end

    # Geometric mean of the parser's throughput relative to each payload's fastest parser.
    def score_for(parser_name)
      MultiXMLBenchmark::Formatter.geometric_mean(grouped_ratios.fetch(parser_name))
    end

    # @return [Float, nil] nil when the parser has no allocation figures
    def allocation_score_for(parser_name)
      values = grouped_allocation_ratios.fetch(parser_name, [])
      return nil if values.empty?

      MultiXMLBenchmark::Formatter.geometric_mean(values)
    end

    def grouped_ratios
      @grouped_ratios ||= begin
        ratios = Hash.new { |hash, key| hash[key] = [] }
        grouped_measurements.each_value { |entries| append_ratios(ratios, entries) }
        ratios
      end
    end

    def grouped_allocation_ratios
      @grouped_allocation_ratios ||= begin
        ratios = Hash.new { |hash, key| hash[key] = [] }
        grouped_measurements.each_value { |entries| append_allocation_ratios(ratios, entries) }
        ratios
      end
    end

    # Parser name => number of payloads on which it had the highest ips.
    def wins
      @wins ||= begin
        counts = Hash.new(0)
        grouped_measurements.each_value { |entries| counts[entries.max_by(&:ips).parser] += 1 }
        counts
      end
    end

    def grouped_measurements
      @grouped_measurements ||= measurements.group_by { |measurement| measurement.payload.label }
    end

    def append_ratios(ratios, entries)
      peak = entries.max_by(&:ips).ips
      entries.each do |entry|
        ratios[entry.parser] << normalized_ratio(entry.ips, peak)
      end
    end

    # Allocation ratios are inverted (fewest / own) so fewer allocations score higher.
    def append_allocation_ratios(ratios, entries)
      alloc_entries = entries.reject { |entry| entry.allocations_per_parse.nil? }
      return if alloc_entries.empty?

      fewest = alloc_entries.min_by(&:allocations_per_parse).allocations_per_parse
      alloc_entries.each do |entry|
        ratios[entry.parser] << normalized_ratio(fewest, entry.allocations_per_parse)
      end
    end

    # Divides by the peak, floored at EPSILON to avoid dividing by zero.
    def normalized_ratio(value, peak)
      value / [peak, MultiXMLBenchmark::EPSILON].max
    end
  end
end
851
+
852
class MultiXMLBenchmark
  # Builds the per-benchmark detail table rows.
  class Details
    # @param parsers [Array<#name>] parsers, in column order
    # @param measurements [Array] measurements responding to #payload, #parser, and #ips
    def initialize(parsers, measurements)
      @parsers = parsers
      @measurements = measurements
    end

    # @return [Array<String>] benchmark, bytes, one "<parser> ops/s" column per parser, winner
    def headers
      rate_columns = parsers.map { |parser| "#{parser.name} ops/s" }
      ["benchmark", "bytes"] + rate_columns + ["winner"]
    end

    # @return [Array<Array<String>>] one row per payload label, in first-seen order
    def rows
      payload_labels.map { |label| detail_row(label) }
    end

    # @return [Array<Symbol>] alignment per column, matching #headers
    def alignments
      [:left, :right] + ([:right] * parsers.length) + [:left]
    end

    private

    attr_reader :parsers, :measurements

    def detail_row(label)
      entries = grouped_measurements.fetch(label)
      size = MultiXMLBenchmark::Formatter.human_bytes(entries.first.payload.bytes)
      [label, size] + parser_rates(index_entries(entries)) + [winner_label(entries)]
    end

    # Parser name => its measurement for one payload.
    def index_entries(entries)
      entries.to_h { |entry| [entry.parser, entry] }
    end

    def parser_rates(indexed)
      parsers.map do |parser|
        measurement = indexed.fetch(parser.name)
        MultiXMLBenchmark::Formatter.format_rate(measurement.ips)
      end
    end

    def winner_label(entries)
      best = entries.max_by(&:ips)
      "#{best.parser} (#{MultiXMLBenchmark::Formatter.format_rate(best.ips)})"
    end

    def payload_labels
      @payload_labels ||= measurements.map { |measurement| measurement.payload.label }.uniq
    end

    def grouped_measurements
      @grouped_measurements ||= measurements.group_by { |measurement| measurement.payload.label }
    end
  end
end
909
+
910
class MultiXMLBenchmark
  # Renders plain-text and markdown tables for benchmark output.
  class TableRenderer
    # Narrowest markdown column, so the delimiter row always has room for
    # dashes plus an alignment colon.
    MIN_MARKDOWN_WIDTH = 3
    private_constant :MIN_MARKDOWN_WIDTH

    # @param format [Symbol] :markdown for a markdown table; anything else
    #   renders the plain layout
    def initialize(format:)
      @format = format
    end

    # Renders a table as a single newline-joined string.
    #
    # @param headers [Array<String>] column titles
    # @param rows [Array<Array>] cell values; each is converted with #to_s
    # @param alignments [Array<Symbol>] :right or :left per column; missing
    #   entries default to :left. Applied to both output formats.
    # @return [String]
    def render(headers, rows, alignments:)
      return markdown_table(headers, rows, alignments) if format == :markdown

      plain_table(headers, rows, column_widths(headers, rows), alignments)
    end

    private

    attr_reader :format

    # Width of each column: the longest of its header and its cells.
    def column_widths(headers, rows)
      headers.each_index.map do |index|
        ([headers[index].to_s.length] + rows.map { |row| row[index].to_s.length }).max
      end
    end

    def plain_table(headers, rows, widths, alignments)
      [
        format_row(headers, widths, alignments),
        format_row(widths.map { |width| "-" * width }, widths, Array.new(widths.length, :left)),
        *rows.map { |row| format_row(row, widths, alignments) }
      ].join("\n")
    end

    # Cells are escaped before widths are measured so the padding matches the
    # text that is actually emitted.
    def markdown_table(headers, rows, alignments)
      safe_headers = headers.map { |cell| escape_markdown(cell) }
      safe_rows = rows.map { |row| row.map { |cell| escape_markdown(cell) } }
      widths = column_widths(safe_headers, safe_rows).map { |width| [width, MIN_MARKDOWN_WIDTH].max }

      [
        markdown_row(safe_headers, widths, alignments),
        markdown_row(markdown_rules(widths, alignments), widths, alignments),
        *safe_rows.map { |row| markdown_row(row, widths, alignments) }
      ].join("\n")
    end

    # Delimiter cells: dashes, ending in a colon for right-aligned columns so
    # markdown renderers align them as requested.
    def markdown_rules(widths, alignments)
      widths.each_with_index.map do |width, index|
        (alignments[index] == :right) ? "#{"-" * (width - 1)}:" : "-" * width
      end
    end

    # A literal pipe inside a cell would be read as a column separator.
    def escape_markdown(cell)
      cell.to_s.gsub("|") { "\\|" }
    end

    def format_row(row, widths, alignments)
      row.each_with_index.map do |cell, index|
        alignment = alignments[index] || :left
        align_cell(cell.to_s, widths[index], alignment)
      end.join(" ")
    end

    def markdown_row(row, widths, alignments)
      cells = row.each_with_index.map do |cell, index|
        align_cell(cell.to_s, widths[index], alignments[index] || :left)
      end
      "| #{cells.join(" | ")} |"
    end

    def align_cell(text, width, alignment)
      (alignment == :right) ? text.rjust(width) : text.ljust(width)
    end
  end
end
967
+
968
class MultiXMLBenchmark
  # Shared numeric and display formatting helpers for benchmark output.
  class Formatter
    # Middle value of the sorted list; the mean of the two middle values when
    # the list has an even number of elements.
    def self.median(values)
      ordered = values.sort
      middle = ordered.length / 2
      ordered.length.even? ? (ordered[middle - 1] + ordered[middle]) / 2.0 : ordered[middle]
    end

    # Geometric mean computed through logarithms; each value is floored at
    # EPSILON before taking its log.
    def self.geometric_mean(values)
      log_total = values.sum { |value| Math.log([value, MultiXMLBenchmark::EPSILON].max) }
      Math.exp(log_total / values.length)
    end

    # Compact rate text: "1.50M", "2.5k", two decimals below 10, otherwise a
    # whole number.
    def self.format_rate(rate)
      if rate >= 1_000_000
        Kernel.format("%.2fM", rate / 1_000_000.0)
      elsif rate >= 1_000
        Kernel.format("%.1fk", rate / 1_000.0)
      elsif rate < 10
        Kernel.format("%.2f", rate)
      else
        Kernel.format("%.0f", rate)
      end
    end

    # Byte count as "N B", "N.N KB", or "N.NN MB" (1024-based units).
    def self.human_bytes(bytes)
      if bytes < 1024
        "#{bytes} B"
      elsif bytes < 1024 * 1024
        Kernel.format("%.1f KB", bytes / 1024.0)
      else
        Kernel.format("%.2f MB", bytes / MultiXMLBenchmark::BYTES_PER_MEGABYTE)
      end
    end
  end
end
1001
+
1002
# Run the benchmark only when this file is the program being executed (not
# when it is loaded by another script); the value returned by
# MultiXMLBenchmark.run becomes the process exit status.
if __FILE__ == $PROGRAM_NAME
  exit(MultiXMLBenchmark.run)
end