ssrjson-benchmark 0.0.9__cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1190 @@
1
+ import gc
2
+ import io
3
+ import json
4
+ import math
5
+ import os
6
+ import pathlib
7
+ import platform
8
+ import re
9
+ import sys
10
+ import time
11
+ from importlib.util import find_spec
12
+ from typing import TYPE_CHECKING, Any, Callable, List
13
+
14
+ import matplotlib as mpl
15
+ import matplotlib.pyplot as plt
16
+
17
+ from . import _ssrjson_benchmark as internal
18
+ from .result_types import BenchmarkFinalResult, BenchmarkResultPerFile
19
+
20
+ if TYPE_CHECKING:
21
+ from reportlab.pdfgen import canvas
22
+
23
# Directory containing this module (used to locate the report template).
_CUR_DIR = os.path.dirname(os.path.abspath(__file__))
# Nanoseconds per second, for converting measured times to seconds.
_NS_IN_ONE_S = 1000000000

# Fonts used when rendering the PDF report.
_PDF_HEADING_FONT = "Helvetica-Bold"
_PDF_TEXT_FONT = "Courier"

# Benchmarked libraries and their plot colors.
# baseline is the first one.
_LIBRARIES_COLORS = {
    "json": "#74c476",
    "ujson": "#c994c7",
    "msgspec": "#8856a7",
    "orjson": "#2c7fb8",
    "ssrjson": "#fd8d3c",
}

# Names and indices of the two top-level benchmark index groups.
_NAME_LOADDUMP = "load&dump"
_INDEX_LOADDUMP = 0
_NAME_DUMPSTOBYTES = "dumps_to_bytes"
_INDEX_DUMPSTOBYTES = 1

# Ordered index-group names, and their human-readable report headings.
_INDEXED_GROUPS = [_NAME_LOADDUMP, _NAME_DUMPSTOBYTES]
_PRINT_INDEX_GROUPS = ["Loads & Dumps to str", "Dumps to bytes"]
45
+
46
+
47
class BenchmarkFunction:
    """A single callable under benchmark, tagged with its providing library."""

    def __init__(self, func: Callable, library_name: str) -> None:
        # The callable to time (e.g. json.loads, ssrjson.dumps).
        self.func = func
        # Library name, used as a key into per-library results and colors.
        self.library_name = library_name
51
+
52
+
53
class BenchmarkGroup:
    """One benchmark scenario: a timing driver plus the per-library callables it runs."""

    def __init__(
        self,
        benchmarker: Callable,
        functions: list[BenchmarkFunction],
        index_name: str,
        group_name: str,
        input_preprocessor: Callable[[Any], Any] = lambda x: x,
        is_dumps=False,
        skip_when_ascii=False,
    ) -> None:
        # Driver that performs the repeated timing loop.
        self.benchmarker = benchmarker
        # One BenchmarkFunction per library participating in this group.
        self.functions = functions
        # Which top-level index group this scenario belongs to.
        self.index_name = index_name
        # Human-readable scenario name; also the result-dict key.
        self.group_name = group_name
        # Transforms raw file bytes into the input the benchmarker expects.
        self.input_preprocessor = input_preprocessor
        # True when the scenario measures serialization (output size matters).
        self.is_dumps = is_dumps
        # True when the scenario is meaningless for pure-ASCII inputs.
        self.skip_when_ascii = skip_when_ascii
71
+
72
+
73
+ # benchmarkers
74
# benchmarkers
def _benchmark(repeat_time: int, times_per_bin: int, func, data: bytes):
    """
    Run repeat benchmark for bytes input.

    Calls ``func(data)`` once to warm up, then ``repeat_time`` times with
    automatic GC disabled for the duration of the measurement.
    returns time used (ns).
    """
    # times_per_bin not used
    # disable automatic GC
    gc_was_enabled = _gc_prepare()
    try:
        # warm up
        internal.run_object_accumulate_benchmark(func, 1, (data,))
        return internal.run_object_accumulate_benchmark(func, repeat_time, (data,))
    finally:
        # restore GC only if it was on before we disabled it
        if gc_was_enabled:
            gc.enable()
89
+
90
+
91
def _benchmark_unicode_arg(repeat_time: int, times_per_bin: int, func, unicode: str):
    """
    Run repeat benchmark, disabling utf-8 cache.

    Work is done in bins of ``times_per_bin`` calls: each bin gets fresh
    copies of ``unicode`` with their UTF-8 caches invalidated, so ``func``
    never benefits from a warm cache on any timed call.
    returns time used (ns).
    """
    # disable automatic GC
    gc_was_enabled = _gc_prepare()
    try:
        times_left = repeat_time
        total = 0
        while times_left != 0:
            cur_bin_size = min(times_left, times_per_bin)
            times_left -= cur_bin_size
            # prepare identical data, without sharing objects
            # (+1: index 0 is reserved for the untimed warm-up call)
            benchmark_data = internal.copy_unicode_list_invalidate_cache(
                unicode, cur_bin_size + 1
            )
            assert _check_str_cache(benchmark_data[0], False)
            # warm up
            internal.run_object_benchmark(func, (benchmark_data[0],))
            # timed calls, one per fresh copy
            for i in range(1, cur_bin_size + 1):
                total += internal.run_object_benchmark(func, (benchmark_data[i],))
            # drop the copies before the next bin to bound peak memory
            del benchmark_data
        return total
    finally:
        if gc_was_enabled:
            gc.enable()
119
+
120
+
121
def _benchmark_invalidate_dump_cache(
    repeat_time: int, times_per_bin: int, func, raw_bytes: bytes
):
    """
    Invalidate UTF-8 cache for the same input.

    Each bin re-parses ``raw_bytes`` with ``json.loads`` to obtain fresh
    object trees whose strings carry no UTF-8 cache, so each timed call of
    ``func`` starts cold.
    returns time used (ns).
    """
    # disable automatic GC
    gc_was_enabled = _gc_prepare()
    try:
        times_left = repeat_time
        total = 0
        while times_left != 0:
            cur_bin_size = min(times_left, times_per_bin)
            times_left -= cur_bin_size
            # prepare identical data, without sharing objects
            # (+1: index 0 is reserved for the untimed warm-up call)
            benchmark_data = [json.loads(raw_bytes) for _ in range(cur_bin_size + 1)]
            assert _recursive_check_cache(benchmark_data[0], False)
            # warm up
            internal.run_object_benchmark(func, (benchmark_data[0],))
            # timed calls, one per fresh object tree
            for i in range(1, cur_bin_size + 1):
                # assert _recursive_check_cache(benchmark_data[i], False)
                total += internal.run_object_benchmark(func, (benchmark_data[i],))
            # drop the copies before the next bin to bound peak memory
            del benchmark_data
        return total
    finally:
        if gc_was_enabled:
            gc.enable()
150
+
151
+
152
def _benchmark_with_dump_cache(
    repeat_time: int, times_per_bin: int, func, raw_bytes: bytes
):
    """
    Run repeat benchmark for dumps with UTF-8 caches pre-populated.

    The same parsed object tree (with caches ensured via ensure_utf8_cache)
    is reused for every call, measuring the warm-cache fast path.
    returns time used (ns).
    """
    # times_per_bin not used
    # disable automatic GC
    gc_was_enabled = _gc_prepare()
    try:
        data = json.loads(raw_bytes)
        # ensure cache
        ensure_utf8_cache(data)
        assert _recursive_check_cache(data, True)
        # warm up
        internal.run_object_accumulate_benchmark(func, 1, (data,))
        return internal.run_object_accumulate_benchmark(func, repeat_time, (data,))
    finally:
        if gc_was_enabled:
            gc.enable()
169
+
170
+
171
def _get_benchmark_defs() -> tuple[BenchmarkGroup, ...]:
    """
    Build the full table of benchmark scenarios.

    Each BenchmarkGroup pairs a timing driver with equivalent per-library
    callables; non-ssrjson dumps are normalized (decode/encode) so every
    library in a group produces the same output type.
    """
    import msgspec
    import orjson
    import ssrjson
    import ujson

    return (
        # loads from a str argument (UTF-8 cache invalidated per call)
        BenchmarkGroup(
            _benchmark_unicode_arg,
            [
                BenchmarkFunction(json.loads, "json"),
                BenchmarkFunction(ujson.loads, "ujson"),
                BenchmarkFunction(msgspec.json.decode, "msgspec"),
                BenchmarkFunction(orjson.loads, "orjson"),
                BenchmarkFunction(ssrjson.loads, "ssrjson"),
            ],
            _INDEXED_GROUPS[0],
            "loads str",
            input_preprocessor=lambda x: x.decode("utf-8"),
            is_dumps=False,
            skip_when_ascii=False,
        ),
        # loads from a bytes argument
        BenchmarkGroup(
            _benchmark,
            [
                BenchmarkFunction(json.loads, "json"),
                BenchmarkFunction(ujson.loads, "ujson"),
                BenchmarkFunction(msgspec.json.decode, "msgspec"),
                BenchmarkFunction(orjson.loads, "orjson"),
                BenchmarkFunction(ssrjson.loads, "ssrjson"),
            ],
            _INDEXED_GROUPS[0],
            "loads bytes",
            is_dumps=False,
            skip_when_ascii=False,
        ),
        # dumps producing str (bytes-producing libraries decode to match)
        BenchmarkGroup(
            _benchmark_invalidate_dump_cache,
            [
                BenchmarkFunction(lambda x: json.dumps(x, ensure_ascii=False), "json"),
                BenchmarkFunction(
                    lambda x: ujson.dumps(x, ensure_ascii=False), "ujson"
                ),
                BenchmarkFunction(
                    lambda x: msgspec.json.encode(x).decode("utf-8"), "msgspec"
                ),
                BenchmarkFunction(lambda x: orjson.dumps(x).decode("utf-8"), "orjson"),
                BenchmarkFunction(ssrjson.dumps, "ssrjson"),
            ],
            _INDEXED_GROUPS[0],
            "dumps to str",
            is_dumps=True,
            skip_when_ascii=False,
        ),
        # dumps producing str with indent=2
        BenchmarkGroup(
            _benchmark_invalidate_dump_cache,
            [
                BenchmarkFunction(
                    lambda x: json.dumps(x, indent=2, ensure_ascii=False), "json"
                ),
                BenchmarkFunction(
                    lambda x: ujson.dumps(x, indent=2, ensure_ascii=False), "ujson"
                ),
                BenchmarkFunction(
                    lambda x: msgspec.json.format(
                        msgspec.json.encode(x), indent=2
                    ).decode("utf-8"),
                    "msgspec",
                ),
                BenchmarkFunction(
                    lambda x: orjson.dumps(x, option=orjson.OPT_INDENT_2).decode(
                        "utf-8"
                    ),
                    "orjson",
                ),
                BenchmarkFunction(lambda x: ssrjson.dumps(x, indent=2), "ssrjson"),
            ],
            _INDEXED_GROUPS[0],
            "dumps to str (indented2)",
            is_dumps=True,
            skip_when_ascii=False,
        ),
        # dumps producing bytes (str-producing libraries encode to match)
        BenchmarkGroup(
            _benchmark_invalidate_dump_cache,
            [
                BenchmarkFunction(
                    lambda x: json.dumps(x, ensure_ascii=False).encode("utf-8"), "json"
                ),
                BenchmarkFunction(
                    lambda x: ujson.dumps(x, ensure_ascii=False).encode("utf-8"),
                    "ujson",
                ),
                BenchmarkFunction(
                    msgspec.json.encode,
                    "msgspec",
                ),
                BenchmarkFunction(orjson.dumps, "orjson"),
                BenchmarkFunction(ssrjson.dumps_to_bytes, "ssrjson"),
            ],
            _INDEXED_GROUPS[1],
            "dumps to bytes",
            is_dumps=True,
            skip_when_ascii=False,
        ),
        # dumps producing bytes with indent=2
        BenchmarkGroup(
            _benchmark_invalidate_dump_cache,
            [
                BenchmarkFunction(
                    lambda x: json.dumps(x, indent=2, ensure_ascii=False).encode(
                        "utf-8"
                    ),
                    "json",
                ),
                BenchmarkFunction(
                    lambda x: ujson.dumps(x, indent=2, ensure_ascii=False).encode(
                        "utf-8"
                    ),
                    "ujson",
                ),
                BenchmarkFunction(
                    lambda x: msgspec.json.format(msgspec.json.encode(x), indent=2),
                    "msgspec",
                ),
                BenchmarkFunction(
                    lambda x: orjson.dumps(x, option=orjson.OPT_INDENT_2), "orjson"
                ),
                BenchmarkFunction(
                    lambda x: ssrjson.dumps_to_bytes(x, indent=2), "ssrjson"
                ),
            ],
            _INDEXED_GROUPS[1],
            "dumps to bytes (indented2)",
            is_dumps=True,
            skip_when_ascii=False,
        ),
        # dumps to bytes with UTF-8 caches pre-populated; ASCII inputs are
        # skipped because they never carry a separate UTF-8 cache
        BenchmarkGroup(
            _benchmark_with_dump_cache,
            [
                BenchmarkFunction(
                    lambda x: json.dumps(x, ensure_ascii=False).encode("utf-8"), "json"
                ),
                BenchmarkFunction(
                    lambda x: ujson.dumps(x, ensure_ascii=False).encode("utf-8"),
                    "ujson",
                ),
                BenchmarkFunction(
                    msgspec.json.encode,
                    "msgspec",
                ),
                BenchmarkFunction(orjson.dumps, "orjson"),
                BenchmarkFunction(ssrjson.dumps_to_bytes, "ssrjson"),
            ],
            _INDEXED_GROUPS[1],
            "dumps to bytes (cached)",
            is_dumps=True,
            skip_when_ascii=True,
        ),
        # dumps to bytes while writing the UTF-8 cache (ssrjson-specific flag)
        BenchmarkGroup(
            _benchmark_invalidate_dump_cache,
            [
                BenchmarkFunction(
                    lambda x: json.dumps(x, ensure_ascii=False).encode("utf-8"), "json"
                ),
                BenchmarkFunction(
                    lambda x: ujson.dumps(x, ensure_ascii=False).encode("utf-8"),
                    "ujson",
                ),
                BenchmarkFunction(
                    msgspec.json.encode,
                    "msgspec",
                ),
                BenchmarkFunction(orjson.dumps, "orjson"),
                BenchmarkFunction(
                    lambda x: ssrjson.dumps_to_bytes(x, is_write_cache=True), "ssrjson"
                ),
            ],
            _INDEXED_GROUPS[1],
            "dumps to bytes (write cache)",
            is_dumps=True,
            skip_when_ascii=True,
        ),
    )
353
+
354
+
355
def _get_benchmark_libraries() -> dict[str, BenchmarkGroup]:
    """Map each scenario's group_name to its BenchmarkGroup definition."""
    groups: dict[str, BenchmarkGroup] = {}
    for group in _get_benchmark_defs():
        groups[group.group_name] = group
    return groups
357
+
358
+
359
+ def _gc_prepare():
360
+ """
361
+ Call collect once, and then disable automatic GC.
362
+ Return True if automatic GC was enabled.
363
+ """
364
+ gc.collect()
365
+ gc_was_enabled = gc.isenabled()
366
+ if gc_was_enabled:
367
+ gc.disable()
368
+ return gc_was_enabled
369
+
370
+
371
def _check_str_cache(s: str, want_cache: bool):
    """Return True when the UTF-8 cache state of ``s`` matches ``want_cache``.

    ASCII strings always pass: they never carry a separate UTF-8 cache.
    """
    _, _, is_ascii, _ = internal.inspect_pyunicode(s)
    if is_ascii:
        return is_ascii
    return want_cache == internal.pyunicode_has_utf8_cache(s)
374
+
375
+
376
+ def _recursive_check_cache(obj, want_cache: bool):
377
+ if isinstance(obj, str):
378
+ return _check_str_cache(obj, want_cache)
379
+ if isinstance(obj, list):
380
+ for item in obj:
381
+ if not _recursive_check_cache(item, want_cache):
382
+ return False
383
+ return True
384
+ if isinstance(obj, dict):
385
+ for key, value in obj.items():
386
+ if not _recursive_check_cache(key, want_cache):
387
+ return False
388
+ if not _recursive_check_cache(value, want_cache):
389
+ return False
390
+ return True
391
+ # other types
392
+ return True
393
+
394
+
395
def ensure_utf8_cache(encodable):
    """
    Ensure UTF-8 cache(s) in object.

    Called for its side effect only; the serialized output is discarded.
    """
    # We use orjson.dumps,
    # which always create UTF-8 caches for all non-ASCII str.
    import orjson

    orjson.dumps(encodable)
404
+
405
+
406
+ def _get_processed_size(func: Callable, input_data, is_dumps):
407
+ if is_dumps:
408
+ # get output size of dumps
409
+ data_obj = json.loads(input_data)
410
+ output = func(data_obj)
411
+ if isinstance(output, bytes):
412
+ size = len(output)
413
+ else:
414
+ size = internal.inspect_pyunicode(output)[1]
415
+ else:
416
+ # get loads input size
417
+ size = (
418
+ len(input_data)
419
+ if isinstance(input_data, bytes)
420
+ else internal.inspect_pyunicode(input_data)[1]
421
+ )
422
+ return size
423
+
424
+
425
+ def _update_inspect_result(old_kind, old_size, old_is_ascii, kind, str_size, is_ascii):
426
+ return (
427
+ max(old_kind, kind),
428
+ old_size + str_size,
429
+ old_is_ascii and is_ascii,
430
+ )
431
+
432
+
433
def _inspect_pyunicode_in_json(obj):
    """
    Aggregate string statistics over a parsed JSON tree.

    Returns (kind, str_size, is_ascii): the widest PyUnicode kind seen, the
    total UTF-8 size of all strings, and whether every string was ASCII.
    Defaults are (1, 0, True) for subtrees containing no strings.
    """
    kind = 1
    str_size = 0
    is_ascii = True
    if isinstance(obj, dict):
        for k, v in obj.items():
            # dict keys are always str in parsed JSON; inspect directly
            _kind, _str_size, _is_ascii, _ = internal.inspect_pyunicode(k)
            kind, str_size, is_ascii = _update_inspect_result(
                kind, str_size, is_ascii, _kind, _str_size, _is_ascii
            )
            _kind, _str_size, _is_ascii = _inspect_pyunicode_in_json(v)
            # check empty dict/list
            kind, str_size, is_ascii = _update_inspect_result(
                kind, str_size, is_ascii, _kind, _str_size, _is_ascii
            )
        return kind, str_size, is_ascii
    if isinstance(obj, list):
        for item in obj:
            _kind, _str_size, _is_ascii = _inspect_pyunicode_in_json(item)
            kind, str_size, is_ascii = _update_inspect_result(
                kind, str_size, is_ascii, _kind, _str_size, _is_ascii
            )
        return kind, str_size, is_ascii
    if isinstance(obj, str):
        # drop the trailing has-cache flag from inspect_pyunicode
        return internal.inspect_pyunicode(obj)[:3]
    # numbers / bools / None contribute nothing
    return kind, str_size, is_ascii
459
+
460
+
461
def _run_benchmark(
    cur_result_file: BenchmarkResultPerFile,
    repeat_times: int,
    times_per_bin: int,
    input_data: str | bytes,
    benchmark_group: BenchmarkGroup,
):
    """
    Run one BenchmarkGroup over one input and record speeds and ratios.

    Fills cur_result_file[group_name] with per-library timings, the
    baseline-relative ratio, and ssrjson's throughput in bytes/sec.
    Mutates cur_result_file in place; returns None.
    """
    group_name = benchmark_group.group_name
    cur_target = cur_result_file[group_name]

    # e.g. decode bytes to str for str-input scenarios
    input_data = benchmark_group.input_preprocessor(input_data)

    for benchmark_target in benchmark_group.functions:
        prefix = f"[{benchmark_target.library_name}][{benchmark_group.group_name}]"
        # pad the prefix so the progress columns line up
        print(
            prefix
            + (" " * max(0, 50 - len(prefix)))
            + f"repeat_times={repeat_times} times_per_bin={times_per_bin}"
        )
        # total elapsed time in ns for repeat_times calls
        speed = benchmark_group.benchmarker(
            repeat_times, times_per_bin, benchmark_target.func, input_data
        )
        cur_lib = cur_target[benchmark_target.library_name]
        cur_lib.speed = speed

    # ratios are computed against the stdlib json baseline
    baseline_name = "json"
    baseline_data = cur_target[baseline_name]
    for benchmark_target in benchmark_group.functions:
        cur_lib = cur_target[benchmark_target.library_name]
        if benchmark_target.library_name == "ssrjson":
            # calculate bytes per sec for ssrJSON
            size = _get_processed_size(
                benchmark_target.func, input_data, benchmark_group.is_dumps
            )
            cur_target.ssrjson_bytes_per_sec = (
                size * repeat_times / (cur_lib.speed / _NS_IN_ONE_S)
            )

        # guard against division by zero when a timing is 0
        cur_lib.ratio = (
            math.inf
            if baseline_data.speed == 0
            else (baseline_data.speed / cur_lib.speed)
        )
504
+
505
+
506
def _run_file_benchmark(
    benchmark_libraries: dict[str, BenchmarkGroup],
    file: pathlib.Path,
    process_bytes: int,
    bin_process_bytes: int,
    index_s: str,
):
    """
    Run every matching benchmark group against one JSON file.

    repeat_times is chosen so roughly process_bytes bytes are processed in
    total; times_per_bin likewise targets bin_process_bytes per bin. Groups
    whose index_name differs from index_s are skipped, as are ASCII-skipping
    groups when the file content is pure ASCII.

    Returns (base_file_name, BenchmarkResultPerFile).
    Raises RuntimeError for an empty input file.
    """
    print(f"Running benchmark for {file.name}, index group: {index_s}")
    with open(file, "rb") as f:
        raw_bytes = f.read()
    base_file_name = os.path.basename(file)
    cur_result_file = BenchmarkResultPerFile()
    cur_result_file.byte_size = bytes_size = len(raw_bytes)
    if bytes_size == 0:
        raise RuntimeError(f"File {file} is empty.")
    # string statistics drive ASCII-skip decisions and the report metadata
    kind, str_size, is_ascii = _inspect_pyunicode_in_json(json.loads(raw_bytes))
    assert isinstance(kind, int)
    assert isinstance(str_size, int)
    assert isinstance(is_ascii, bool)
    cur_result_file.pyunicode_size = str_size
    cur_result_file.pyunicode_kind = kind
    cur_result_file.pyunicode_is_ascii = is_ascii
    # ceiling division: always run at least once
    repeat_times = int((process_bytes + bytes_size - 1) // bytes_size)
    times_per_bin = max(1, bin_process_bytes // bytes_size)

    for benchmark_group in benchmark_libraries.values():
        if benchmark_group.index_name == index_s and (
            not benchmark_group.skip_when_ascii or not is_ascii
        ):
            _run_benchmark(
                cur_result_file, repeat_times, times_per_bin, raw_bytes, benchmark_group
            )
    return base_file_name, cur_result_file
539
+
540
+
541
def _get_ssrjson_rev():
    """Return the ssrjson version string, falling back to the nested module."""
    import ssrjson

    version = getattr(ssrjson, "__version__", None)
    if version:
        return version
    # some builds expose the version only on the inner extension module
    return getattr(ssrjson, "ssrjson").__version__
547
+
548
+
549
def _get_real_output_file_name():
    """Return the result filename, embedding the ssrjson revision when known."""
    rev = _get_ssrjson_rev()
    return f"benchmark_result_{rev}.json" if rev else "benchmark_result.json"
556
+
557
+
558
def _get_cpu_name() -> str:
    """
    Best-effort human-readable CPU model name.

    Tries the optional ``cpuinfo`` package, then platform.processor(), then
    /proc/cpuinfo on Linux; falls back to "UnknownCPU".
    """
    cpuinfo_spec = find_spec("cpuinfo")
    if cpuinfo_spec is not None:
        import cpuinfo

        cpu_name = cpuinfo.get_cpu_info().get("brand_raw", "UnknownCPU")
    else:
        # fallback
        cpu_name: str = platform.processor()
        if cpu_name.strip() == "":
            # linux fallback
            if os.path.exists("/proc/cpuinfo"):
                with open(file="/proc/cpuinfo", mode="r") as file:
                    cpu_info_lines = file.readlines()
                # first "model name" entry is taken as the CPU name
                for line in cpu_info_lines:
                    if "model name" in line:
                        cpu_name = re.sub(
                            pattern=r"model name\s+:\s+", repl="", string=line
                        )
                        break
            else:
                cpu_name = "UnknownCPU"
    # merge nearby spaces
    return re.sub(pattern=r"\s+", repl=" ", string=cpu_name).strip()
582
+
583
+
584
def _get_mem_total() -> str:
    """
    Total system memory formatted as "X.XXXGiB".

    Reads /proc/meminfo on Linux and psutil on Windows.
    NOTE(review): on any other platform mem_total stays 0 and the function
    reports "0.000GiB" — confirm whether macOS support is needed.
    """
    mem_total: int = 0  # in KiB
    if platform.system() == "Linux":
        with open(file="/proc/meminfo", mode="r") as file:
            mem_info_lines = file.readlines()
        for line in mem_info_lines:
            if "MemTotal" in line:
                # strip everything but digits (value is in kB)
                mem_total = int(re.sub(pattern=r"[^0-9]", repl="", string=line))
                break
    elif platform.system() == "Windows":
        import psutil

        mem_total = psutil.virtual_memory().total // 1024  # in KB
    # KiB -> GiB
    return f"{mem_total / (1024**2):.3f}GiB"
598
+
599
+
600
def _plot_prepare():
    """Configure matplotlib for headless SVG rendering."""
    # Agg backend: render without a display server.
    mpl.use("Agg")
    # Keep text as real <text> elements in SVG output (not vector paths),
    # so svglib can map fonts when converting to PDF.
    mpl.rcParams["svg.fonttype"] = "none"
603
+
604
+
605
+ def _get_ratio_color(ratio: float) -> str:
606
+ if ratio < 1:
607
+ return "#d63031" # red (worse than baseline)
608
+ elif ratio == 1:
609
+ return "black" # black (baseline)
610
+ elif ratio < 2:
611
+ return "#e67e22" # orange (similar/slightly better)
612
+ elif ratio < 4:
613
+ return "#f39c12" # amber (decent improvement)
614
+ elif ratio < 8:
615
+ return "#27ae60" # green (good)
616
+ elif ratio < 16:
617
+ return "#2980b9" # blue (great)
618
+ else:
619
+ return "#8e44ad" # purple (exceptional)
620
+
621
+
622
def _plot_relative_ops(
    categories: list[str], data: dict, doc_name: str, mask: list[bool] | None = None
) -> io.BytesIO:
    """
    Plot one bar chart per category showing each library's speed ratio.

    ``data`` maps category name -> per-library results (with "ratio") plus
    "ssrjson_bytes_per_sec". Categories whose ``mask`` entry is False are
    left blank. Returns the figure as an in-memory SVG buffer.
    """
    if mask is None:
        mask = [True] * len(categories)
    libs = list(_LIBRARIES_COLORS.keys())
    colors = [_LIBRARIES_COLORS[n] for n in libs]
    n = len(categories)
    bar_width = 0.2
    inner_pad = 0

    # one subplot per category, packed edge to edge
    fig, axs = plt.subplots(
        1,
        n,
        figsize=(3 * n, 4),
        sharey=False,
        tight_layout=True,
        gridspec_kw={"wspace": 0},
    )

    x_positions = [i * (bar_width + inner_pad) for i in range(len(libs))]

    for i, (ax, cat) in enumerate(zip(axs, categories)):
        if not mask[i]:
            # masked category: leave an empty panel
            ax.axis("off")
            continue
        # baseline (first library) is always 1.0 by definition
        vals = [1.0] + [data[cat][name]["ratio"] for name in libs[1:]]
        gbps = (data[cat]["ssrjson_bytes_per_sec"]) / (1024**3)

        for xi, val, col in zip(x_positions, vals, colors):
            ax.bar(xi, val, width=bar_width, color=col)
            # ratio label above each bar, colored by how good it is
            ax.text(
                xi,
                val + 0.05,
                f"{val:.2f}x",
                ha="center",
                va="bottom",
                fontsize=9,
                color=_get_ratio_color(val),
            )

        # annotate ssrjson's absolute throughput inside its bar
        ssrjson_index = libs.index("ssrjson")
        ax.text(
            x_positions[ssrjson_index],
            vals[ssrjson_index] / 2,
            f"{gbps:.2f} GB/s",
            ha="center",
            va="center",
            fontsize=10,
            color="#2c3e50",
            fontweight="bold",
        )

        # baseline line
        ax.axhline(1.0, color="gray", linestyle="--", linewidth=1)
        # height = 1.1 * max bar height
        ax.set_ylim(0, max(vals + [1.0]) * 1.1)

        # hide all tick
        ax.tick_params(
            axis="both",
            which="both",
            left=False,
            bottom=False,
            labelleft=False,
            labelbottom=False,
        )

        # and spine
        for spine in ("left", "top", "right"):
            ax.spines[spine].set_visible(False)

        ax.set_xlabel(cat, fontsize=10, labelpad=6)

    fig.suptitle(
        doc_name,
        fontsize=20,
        fontweight="bold",
        y=0.98,
    )

    # color legend
    legend_elements = [
        plt.Line2D([0], [0], color=col, lw=4, label=name)
        for name, col in _LIBRARIES_COLORS.items()
    ]
    fig.legend(
        handles=legend_elements,
        loc="upper right",
        bbox_to_anchor=(0.98, 0.95),
        ncol=len(libs),
        fontsize=14,
        frameon=False,
    )

    fig.text(
        0.5,
        0,
        "Higher is better",
        ha="center",
        va="bottom",
        fontsize=8,
        style="italic",
        color="#555555",
    )

    # serialize to SVG in memory and release the figure
    buf = io.BytesIO()
    plt.savefig(buf, format="svg", bbox_inches="tight")
    buf.seek(0)
    plt.close(fig)
    return buf
733
+
734
+
735
def _plot_distribution(ratio_distr: List[List[float]]):
    """
    Box-plot the ratio distribution of every non-baseline library.

    ``ratio_distr`` holds one list of ratios per library, in the order of
    _LIBRARIES_COLORS minus the baseline. Returns an in-memory SVG buffer.
    """
    # skip the baseline library (first entry)
    lib_names = list(_LIBRARIES_COLORS.keys())[1:]
    fig, ax = plt.subplots(1, 1, figsize=(3 * len(lib_names), 4), tight_layout=True)

    bplot = ax.boxplot(
        ratio_distr,
        vert=True,
        patch_artist=True,
        showfliers=False,
    )

    # emphasize the medians
    for median in bplot["medians"]:
        median.set_color("red")
        median.set_linewidth(2)

    # baseline reference line at ratio 1.0
    ax.axhline(1.0, color="gray", linestyle="--", linewidth=1)
    ax.text(
        0.5,
        1.02,
        "Baseline (json)",
        ha="left",
        va="bottom",
        fontsize=10,
        color="gray",
    )
    ax.set_xticklabels(lib_names)
    ax.set_ylabel("Speed Ratio to json")
    ax.yaxis.set_major_formatter("{x:.1f}x")
    ax.set_title("Speed Ratio Distribution per Library")

    # match each box's fill to the library's plot color
    for patch, color in zip(bplot["boxes"], list(_LIBRARIES_COLORS.values())[1:]):
        patch.set_facecolor(color)

    # serialize to SVG in memory and release the figure
    buf = io.BytesIO()
    plt.savefig(buf, format="svg", bbox_inches="tight")
    buf.seek(0)
    plt.close(fig)
    return buf
773
+
774
+
775
def _draw_page_number(c: "canvas.Canvas", page_num: int):
    """Stamp the page number in small grey italics at the bottom-right corner."""
    from reportlab.lib.pagesizes import A4

    page_width, _ = A4
    c.setFont("Helvetica-Oblique", 8)  # italic
    c.setFillColorRGB(0.5, 0.5, 0.5)  # grey
    c.drawRightString(page_width - 40, 20, f"{page_num}")
782
+
783
+
784
def _generate_pdf_report(
    figures: List[List[io.BytesIO]],
    header_text: str,
    output_pdf_path: str,
    distribution_digest: List[List[float]],
) -> str:
    """
    Assemble the final PDF report.

    Renders the markdown-ish ``header_text`` (first line is the "# " title),
    a TL;DR distribution box plot, then each index group's per-file SVG
    figures, paginating as needed. Returns ``output_pdf_path``.
    """
    from reportlab.graphics import renderPDF
    from reportlab.lib.pagesizes import A4
    from reportlab.pdfgen import canvas
    from svglib.svglib import svg2rlg

    try:
        from svglib.fonts import FontMap

        font_map = FontMap()
        font_map.register_default_fonts()
        # workaround for matplotlib using 700 to represent bold font, but svg2rlg using 700 as normal.
        font_map.register_font("Helvetica", weight="700", rlgFontName="Helvetica-Bold")
    except ImportError:
        # older svglib without FontMap: fall back to default font handling
        font_map = None

    c = canvas.Canvas(output_pdf_path, pagesize=A4)
    width, height = A4

    # heading info
    heading = header_text.splitlines()
    # first line is # header
    header, heading_info = heading[0].removeprefix("#").strip(), heading[1:]
    c.setFont(_PDF_HEADING_FONT, 20)
    text_obj = c.beginText(40, height - 50)
    text_obj.textLine(header)
    c.drawText(text_obj)

    # Wrap heading_info lines if overflow
    max_width = width - 80  # 40 margin on both sides
    wrapped_heading_info = []
    for line in heading_info:
        while c.stringWidth(line, _PDF_TEXT_FONT, 10) > max_width:
            # Find a split point (Courier is monospaced, so width/char-width
            # approximates the character count that fits)
            split_idx = int(max_width // c.stringWidth(" ", _PDF_TEXT_FONT, 10))
            # Try to split at nearest space before split_idx
            space_idx = line.rfind(" ", 0, split_idx)
            if space_idx == -1:
                space_idx = split_idx
            wrapped_heading_info.append(line[:space_idx])
            # TODO fixed indent
            line = " " + line[space_idx:].lstrip()
        wrapped_heading_info.append(line)
    heading_info = wrapped_heading_info

    c.setFont(_PDF_TEXT_FONT, 10)
    text_obj = c.beginText(40, height - 70)
    for line in heading_info:
        text_obj.textLine(line)
    c.drawText(text_obj)

    # footer: attribution text with a clickable URL region
    c.setFont("Helvetica-Oblique", 8)
    text = "This report was generated by https://github.com/Nambers/ssrJSON-benchmark"
    c.drawString(40, 20, text)
    link_start = 40 + c.stringWidth("This report was generated by ")
    link_end = link_start + c.stringWidth(
        "https://github.com/Nambers/ssrJSON-benchmark"
    )
    text_height = 5  # Adjusted height to better fit the link area
    c.linkURL(
        "https://github.com/Nambers/ssrJSON-benchmark",
        (link_start, 20, link_end, 20 + text_height),
        relative=1,
    )

    # estimate vertical space consumed by the heading block
    header_lines = header_text.count("\n") + 1
    header_height = header_lines * 14 + 10
    # subheading spacing = 30
    y_pos = height - header_height - 40
    bottom_margin = 20
    vertical_gap = 20

    # current page number (0-based)
    p = 0

    # distribution plot
    text_obj = c.beginText()
    text_obj.setTextOrigin(40, y_pos)
    text_obj.setFont(_PDF_HEADING_FONT, 14)
    text_obj.textLine("TL;DR")

    c.drawText(text_obj)
    c.bookmarkHorizontal("TL;DR", 0, y_pos + 20)
    c.addOutlineEntry("TL;DR", "TL;DR", level=0)
    y_pos -= 20

    dist_svg_io = _plot_distribution(distribution_digest)
    drawing = svg2rlg(dist_svg_io, font_map=font_map)

    # scale the drawing to the full usable page width
    avail_w = width - 80
    scale = avail_w / drawing.width
    drawing.width *= scale
    drawing.height *= scale
    drawing.scale(scale, scale)

    img_h = drawing.height
    # no enough space
    if y_pos - img_h - vertical_gap < bottom_margin:
        _draw_page_number(c, p)
        p += 1
        c.showPage()
        y_pos = height - bottom_margin

    renderPDF.draw(drawing, c, 40, y_pos - img_h)
    y_pos -= img_h + vertical_gap

    # one section per index group, each with its per-file figures
    for i in range(len(_INDEXED_GROUPS)):
        name = _PRINT_INDEX_GROUPS[i]
        figs = figures[i]

        text_obj = c.beginText()
        text_obj.setTextOrigin(40, y_pos)
        text_obj.setFont(_PDF_HEADING_FONT, 14)
        text_obj.textLine(f"{name}")

        c.drawText(text_obj)
        c.bookmarkHorizontal(name, 0, y_pos + 20)
        c.addOutlineEntry(name, name, level=0)
        y_pos -= 20
        for svg_io in figs:
            svg_io.seek(0)
            drawing = svg2rlg(svg_io, font_map=font_map)

            # scale the drawing to the full usable page width
            avail_w = width - 80
            scale = avail_w / drawing.width
            drawing.width *= scale
            drawing.height *= scale
            drawing.scale(scale, scale)

            img_h = drawing.height
            # no enough space
            if y_pos - img_h - vertical_gap < bottom_margin:
                _draw_page_number(c, p)
                p += 1
                c.showPage()
                y_pos = height - bottom_margin

            # light separator line above each figure
            c.setStrokeColorRGB(0.9, 0.9, 0.9)
            c.setLineWidth(0.4)
            c.line(40, y_pos, width - 40, y_pos)

            renderPDF.draw(drawing, c, 40, y_pos - img_h)
            y_pos -= img_h + vertical_gap

    _draw_page_number(c, p)
    c.save()
    return output_pdf_path
935
+
936
+
937
def _fetch_header(rev, processbytesgb, perbinbytesmb) -> str:
    """
    Render the report header from template.md.

    Fills in the ssrjson revision, timestamp, platform/Python/library
    versions, SIMD features, hardware info, and benchmark sizing parameters.
    """
    import msgspec
    import orjson
    import ssrjson
    import ujson

    with open(os.path.join(_CUR_DIR, "template.md"), "r") as f:
        template = f.read()
    ssrjson_features = ssrjson.get_current_features()
    return template.format(
        REV=rev,
        TIME=time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()),
        OS=f"{platform.system()} {platform.machine()} {platform.release()} {platform.version()}",
        PYTHON=sys.version,
        ORJSON_VER=orjson.__version__,
        MSGSPEC_VER=msgspec.__version__,
        UJSON_VER=ujson.__version__,
        # only the MultiLib and SIMD feature entries are reported
        SIMD_FLAGS={k: ssrjson_features[k] for k in ("MultiLib", "SIMD")},
        CHIPSET=_get_cpu_name(),
        MEM=_get_mem_total(),
        PROCESS_MEM="{:.3f}GiB".format(processbytesgb),
        PER_BIN_MEM="{}MiB".format(perbinbytesmb),
    )
960
+
961
+
962
def _fetch_cats_mask(
    benchmark_groups: dict[str, BenchmarkGroup],
    cats: list[list[str]],
):
    """
    Build a per-index-group visibility mask for ASCII-only inputs.

    A category stays visible unless it belongs to the dumps_to_bytes index
    group AND is marked skip_when_ascii.
    """
    masks = []
    for i in range(len(_INDEXED_GROUPS)):
        mask = []
        for cat_name in cats[i]:
            group = benchmark_groups[cat_name]
            visible = (
                group.index_name != _NAME_DUMPSTOBYTES or not group.skip_when_ascii
            )
            mask.append(visible)
        masks.append(mask)
    return masks
974
+
975
+
976
def generate_report_pdf(
    result: BenchmarkFinalResult, file: str, out_dir: str | None = None
):
    """
    Generate PDF report, using `result`.

    Plots one relative-ops figure per benchmarked file and index group,
    collects the ratio distribution for the TL;DR box plot, and writes
    ``<file>.pdf`` into ``out_dir`` (cwd by default). Returns the output path.
    """
    _plot_prepare()

    if out_dir is None:
        out_dir = os.getcwd()

    file = file.removesuffix(".json")
    report_name = f"{file}.pdf"

    # one figure list per index group
    figures = [[] for _ in range(len(_INDEXED_GROUPS))]
    benchmark_groups = _get_benchmark_libraries()
    # category (group_name) lists, partitioned by index group
    cats = [
        [a for a in result.categories if benchmark_groups[a].index_name == index_name]
        for index_name in _INDEXED_GROUPS
    ]

    # one ratio list per non-baseline library, feeding the distribution plot
    ratios = [[] for _ in range(len(_LIBRARIES_COLORS) - 1)]

    dumps_to_bytes_ascii_mask = _fetch_cats_mask(benchmark_groups, cats)

    for i, indexed_group in enumerate(_INDEXED_GROUPS):
        for bench_filename in result.filenames:
            print(f"Processing {bench_filename} [{indexed_group}](PDF)")
            this_result = result.results[indexed_group][bench_filename]
            # ASCII inputs hide categories that were skipped at run time
            if this_result.pyunicode_is_ascii:
                mask = dumps_to_bytes_ascii_mask[i]
            else:
                mask = [True] * len(cats[i])
            figures[i].append(
                _plot_relative_ops(
                    cats[i],
                    this_result,
                    bench_filename,
                    mask,
                )
            )
            # accumulate ratios only for categories that actually ran
            for j, cat in enumerate(cats[i]):
                if not mask[j]:
                    continue
                for lib_idx in range(len(_LIBRARIES_COLORS) - 1):
                    lib_name = list(_LIBRARIES_COLORS.keys())[lib_idx + 1]
                    ratios[lib_idx].append(
                        result.results[indexed_group][bench_filename][cat][lib_name][
                            "ratio"
                        ]
                    )

    template = _fetch_header(
        file.removeprefix("benchmark_result_").removesuffix(".json"),
        result.processbytesgb,
        result.perbinbytesmb,
    )
    out_path = _generate_pdf_report(
        figures,
        header_text=template,
        output_pdf_path=os.path.join(out_dir, report_name),
        distribution_digest=ratios,
    )
    print(f"Report saved to {out_path}")
    return out_path
1041
+
1042
+
1043
def generate_report_markdown(
    result: BenchmarkFinalResult, file: str, out_dir: str | None = None
):
    """
    Generate a Markdown report (with per-file SVG figures) from `result`.

    Args:
        result: Parsed benchmark results to render.
        file: Base name of the benchmark result file; a trailing ``.json``
            is stripped if present.
        out_dir: Directory in which the report folder is created.
            Defaults to the current working directory.

    Returns:
        Path of the written Markdown report file.
    """
    _plot_prepare()

    if out_dir is None:
        out_dir = os.getcwd()

    file = file.removesuffix(".json")
    report_name = f"{file}.md"
    report_folder = os.path.join(out_dir, f"{file}_report")

    # mkdir (exist_ok avoids the exists()/makedirs() race)
    os.makedirs(report_folder, exist_ok=True)

    # `file` already has its ".json" suffix stripped above, so only the
    # prefix needs removing here.
    template = _fetch_header(
        file.removeprefix("benchmark_result_"),
        result.processbytesgb,
        result.perbinbytesmb,
    )
    template += "\n\n## TL;DR\n\nTLDRIMGPLACEHOLDER\n\n"

    benchmark_groups = _get_benchmark_libraries()
    cats = [
        [a for a in result.categories if benchmark_groups[a].index_name == index_name]
        for index_name in _INDEXED_GROUPS
    ]
    # Per-group category masks for ASCII-only unicode builds; same helper
    # and naming as the PDF report path.
    dumps_to_bytes_ascii_mask = _fetch_cats_mask(benchmark_groups, cats)

    # Baseline is the first library; hoist the non-baseline names once
    # instead of rebuilding the key list in the inner loop.
    non_baseline_libs = list(_LIBRARIES_COLORS)[1:]
    # One ratio bucket per non-baseline library, for the digest plot.
    ratios = [[] for _ in non_baseline_libs]

    for i, indexed_group in enumerate(_INDEXED_GROUPS):
        template += f"\n\n## {_PRINT_INDEX_GROUPS[i]}\n\n"
        for bench_filename in result.filenames:
            print(f"Processing {bench_filename} [{indexed_group}](Markdown)")
            this_result = result.results[indexed_group][bench_filename]
            if this_result.pyunicode_is_ascii:
                mask = dumps_to_bytes_ascii_mask[i]
            else:
                mask = [True] * len(cats[i])
            # Render the figure first; keep the file open only for the write.
            svg_bytes = _plot_relative_ops(
                cats[i],
                this_result,
                bench_filename,
                mask,
            ).getvalue()
            svg_path = os.path.join(
                report_folder, f"{bench_filename}_{indexed_group}.svg"
            )
            with open(svg_path, "wb") as svg_file:
                svg_file.write(svg_bytes)
            # Collect each non-baseline library's speedup ratio per category.
            for j, cat in enumerate(cats[i]):
                if not mask[j]:
                    continue
                for lib_idx, lib_name in enumerate(non_baseline_libs):
                    ratios[lib_idx].append(this_result[cat][lib_name]["ratio"])
            # add svg
            template += f"![{bench_filename}_{indexed_group}](./{bench_filename}_{indexed_group}.svg)\n\n"

    with open(os.path.join(report_folder, "ratio_distribution.svg"), "wb") as svg_file:
        svg_file.write(_plot_distribution(ratios).getvalue())
    template = template.replace(
        "TLDRIMGPLACEHOLDER", "![ratio_distribution](./ratio_distribution.svg)"
    )
    ret = os.path.join(report_folder, report_name)
    with open(ret, "w") as f:
        f.write(template)
    print(f"Report saved to {ret}")
    return ret
1121
+
1122
+
1123
def parse_file_result(j):
    """Deserialize a raw JSON object into a :class:`BenchmarkFinalResult`."""
    parsed = BenchmarkFinalResult.parse(j)
    return parsed
1125
+
1126
+
1127
def is_unix_except_macos():
    """Return True when running on a Unix-like OS other than macOS.

    Only the platforms this package explicitly recognizes (Linux, AIX,
    FreeBSD) count; macOS and Windows yield False.
    """
    return platform.system() in {"Linux", "AIX", "FreeBSD"}
1130
+
1131
+
1132
+ # def _set_multiprocessing_start_method():
1133
+ # try:
1134
+ # multiprocessing.set_start_method("fork")
1135
+ # except RuntimeError as e:
1136
+ # if "context has already been set" not in str(e):
1137
+ # raise
1138
+
1139
+
1140
def run_benchmark(
    files: list[pathlib.Path],
    process_bytes: int,
    bin_process_bytes: int,
):
    """
    Run the benchmark over `files` and write the JSON result to disk.

    Args:
        files: Input JSON files to benchmark.
        process_bytes: Total number of bytes to process per category.
        bin_process_bytes: Number of bytes processed per bin.

    Returns:
        Tuple of (result object, output file name).
    """
    import ssrjson

    # Set multiprocessing start method to fork, if Python version is 3.14+ on Unix
    # if sys.version_info >= (3, 14) and is_unix_except_macos():
    #     _set_multiprocessing_start_method()

    # disable ssrJSON cache writing globally. Restore it after benchmark.
    old_write_cache_status = ssrjson.get_current_features()["WriteUTF8Cache"]
    ssrjson.write_utf8_cache(False)
    try:
        file = _get_real_output_file_name()

        result = BenchmarkFinalResult()
        result.results = dict()

        benchmark_libraries = _get_benchmark_libraries()

        result.categories = list(benchmark_libraries.keys())
        # Iterate the paths directly instead of indexing by range(len(...)).
        result.filenames = [bench_file.name for bench_file in files]
        result.processbytesgb = process_bytes / 1024 / 1024 / 1024
        result.perbinbytesmb = int(bin_process_bytes / 1024 / 1024)

        for index_s in _INDEXED_GROUPS:
            result.results[index_s] = dict()
            for bench_file in files:
                k, v = _run_file_benchmark(
                    benchmark_libraries,
                    bench_file,
                    process_bytes,
                    bin_process_bytes,
                    index_s,
                )
                result.results[index_s][k] = v
        output_result = result.dumps()

        # Opening with mode "w" truncates any existing file, so no explicit
        # exists()/remove() dance is needed beforehand.
        with open(file, "w", encoding="utf-8") as f:
            f.write(output_result)
        return result, file
    finally:
        # Always restore the cache-writing feature, even on failure.
        ssrjson.write_utf8_cache(old_write_cache_status)