benchmatrix 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1060 @@
1
+ """Build pytest-benchmark matrices and attach benchmatrix metadata.
2
+
3
+ pytest-benchmark remains the measurement engine and source of truth for
4
+ calibration, timing, statistics, reporting, and JSON export. This module
5
+ orchestrates metric-by-implementation-by-case matrices, attaches strict
6
+ JSON-safe metadata to ``benchmark.extra_info``, and streams lightweight
7
+ invocation progress records. Detailed timing statistics should be read from
8
+ pytest-benchmark's terminal report, CSV output, saved runs, or JSON output.
9
+
10
+ Target functions must be synchronous callables that complete the work to be
11
+ measured before returning. Async functions are not supported. Lazy return values
12
+ such as generators, lazy dataframe expressions, query objects, futures, and
13
+ deferred computation graphs are not forced by this harness; if such objects are
14
+ returned, the benchmark may measure only object construction.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import copy
20
+ import datetime as dt
21
+ import enum
22
+ import importlib
23
+ import inspect
24
+ import json
25
+ import math
26
+ import re
27
+ import sys
28
+ import warnings
29
+ from collections.abc import Callable, Iterable, Mapping, MutableMapping, Sequence
30
+ from dataclasses import dataclass, field
31
+ from pathlib import PurePath
32
+ from typing import Protocol, SupportsFloat, SupportsIndex, TextIO, TypeAlias, TypeVar, cast
33
+
34
+ from ._schema import (
35
+ DEFAULT_METRICS,
36
+ KEY_CASE_FRESH_INPUTS,
37
+ KEY_CASE_NAME,
38
+ KEY_IMPLEMENTATION_NAME,
39
+ KEY_METRIC_NAME,
40
+ KEY_PRODUCER,
41
+ KEY_SCHEMA_VERSION,
42
+ KEY_TAIL_LATENCY_NOTE,
43
+ KEY_TAIL_PERCENTILES,
44
+ KEY_THROUGHPUT_UNIT,
45
+ KEY_WORK_UNIT_NAME,
46
+ KEY_WORK_UNITS,
47
+ METRIC_BATCH_THROUGHPUT,
48
+ METRIC_SINGLE_CALL_LATENCY,
49
+ METRIC_TAIL_LATENCY,
50
+ PRODUCER,
51
+ SCHEMA_VERSION,
52
+ TAIL_PERCENTILES,
53
+ THROUGHPUT_UNIT_CALLS_PER_SECOND,
54
+ THROUGHPUT_UNIT_WORK_UNITS_PER_SECOND,
55
+ MetricName,
56
+ )
57
+ from ._schema import (
58
+ JsonValue as _JsonValue,
59
+ )
60
+ from .exceptions import MetadataSerializationError
61
+
62
# Generic return type of a benchmarked callable; used by the BenchmarkFixture
# protocol so the fixture call returns whatever the target returns.
T = TypeVar("T")

TargetFunction: TypeAlias = Callable[..., object]
"""Synchronous callable measured by pytest-benchmark through benchmatrix.

Target functions must perform the work being measured before returning. Async
functions are rejected. Lazy return values are not forced by the harness.
"""

# Element type of the list returned by make_benchmark_parameters; the values
# are whatever ``pytest.param`` returns, so they are typed as plain objects.
_BenchmarkParameter: TypeAlias = object
# Shape of benchmark metadata after strict JSON coercion.
_ExtraInfo: TypeAlias = dict[str, _JsonValue]

# Defaults shared by BenchmarkConfig and the BenchmarkFixture.pedantic protocol.
_DEFAULT_PEDANTIC_ROUNDS = 100
_DEFAULT_WARMUP_ROUNDS = 10
_DEFAULT_PEDANTIC_ITERATIONS = 1
# Default BenchmarkCase.work_unit_name.
_DEFAULT_WORK_UNIT_NAME = "items"
# Accepted work-unit names: one ASCII letter followed by letters, digits,
# underscores, or hyphens (so no spaces, slashes, or "/s" suffixes).
_WORK_UNIT_NAME_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9_-]*$")

# Unique sentinel object. NOTE(review): its use is outside this view;
# presumably it marks "value is not a NumPy scalar" — confirm.
_NO_NUMPY_SCALAR = object()
81
+
82
+
83
+ def _empty_args() -> tuple[object, ...]:
84
+ """Return empty positional arguments."""
85
+ return ()
86
+
87
+
88
+ def _empty_kwargs() -> dict[str, object]:
89
+ """Return empty keyword arguments."""
90
+ return {}
91
+
92
+
93
+ def _empty_metadata() -> dict[str, object]:
94
+ """Return empty case metadata."""
95
+ return {}
96
+
97
+
98
class BenchmarkFixture(Protocol):
    """Structural type for the part of the pytest-benchmark fixture benchmatrix uses.

    Only the members the harness touches are declared: the ``extra_info``
    mapping, the plain call, and ``pedantic``.

    Attributes:
        extra_info: Mutable metadata that accompanies the benchmark in
            pytest-benchmark output.
    """

    extra_info: MutableMapping[str, object]

    def __call__(self, target: Callable[..., T], *args: object, **kwargs: object) -> T:
        """Measure ``target`` using pytest-benchmark automatic calibration."""
        ...

    def pedantic(
        self,
        target: Callable[..., T],
        *,
        args: Sequence[object] | None = None,
        kwargs: Mapping[str, object] | None = None,
        setup: Callable[[], tuple[Sequence[object], Mapping[str, object]]] | None = None,
        teardown: Callable[..., object] | None = None,
        rounds: int = _DEFAULT_PEDANTIC_ROUNDS,
        warmup_rounds: int = _DEFAULT_WARMUP_ROUNDS,
        iterations: int = _DEFAULT_PEDANTIC_ITERATIONS,
    ) -> T:
        """Measure ``target`` using pytest-benchmark pedantic mode."""
        ...
125
+
126
+
127
+ class _PytestModule(Protocol):
128
+ """Small surface of pytest used by the harness."""
129
+
130
+ mark: _PytestMark
131
+
132
+ def param(self, *values: object, id: str | None = None) -> object:
133
+ """Return a pytest parameter value."""
134
+ ...
135
+
136
+
137
+ class _PytestMark(Protocol):
138
+ """Small surface of pytest.mark used by the harness."""
139
+
140
+ def parametrize(
141
+ self,
142
+ names: str | Sequence[str],
143
+ values: Iterable[object],
144
+ ) -> Callable[[Callable[..., None]], Callable[..., None]]:
145
+ """Parametrize a pytest test function."""
146
+ ...
147
+
148
+
149
@dataclass(frozen=True, slots=True)
class BenchmarkConfig:
    """Harness settings that benchmatrix forwards to pytest-benchmark.

    Args:
        pedantic_rounds: How many pedantic rounds to request.
        warmup_rounds: How many pedantic warmup rounds to request.
        pedantic_iterations: How many target calls make up one pedantic round
            when inputs are reused. The value is deliberately ignored when
            ``BenchmarkCase.fresh_inputs`` is true, because pytest-benchmark
            setup mode is then used to keep input construction out of the
            timed target-function body.
        stream_progress: Whether the benchmark helpers print one progress
            line per benchmark invocation.

    Attributes:
        pedantic_rounds: How many pedantic rounds to request.
        warmup_rounds: How many pedantic warmup rounds to request.
        pedantic_iterations: How many target calls make up one pedantic round
            when inputs are reused.
        stream_progress: Whether the benchmark helpers print one progress
            line per benchmark invocation.

    Raises:
        ValueError: If rounds or iterations are not positive, or if warmup
            rounds are negative.

    Warning:
        With ``tail_latency`` benchmarks, a ``pedantic_iterations`` value
        above one turns raw samples into per-round aggregate timings instead
        of clean one-call latency samples. The harness emits a runtime
        warning for this configuration.
    """

    pedantic_rounds: int = _DEFAULT_PEDANTIC_ROUNDS
    warmup_rounds: int = _DEFAULT_WARMUP_ROUNDS
    pedantic_iterations: int = _DEFAULT_PEDANTIC_ITERATIONS
    stream_progress: bool = True

    def __post_init__(self) -> None:
        """Reject out-of-range round and iteration counts."""
        rounds = self.pedantic_rounds
        if rounds <= 0:
            raise ValueError("BenchmarkConfig.pedantic_rounds must be positive.")

        # Zero warmup rounds is allowed; only negative counts are rejected.
        warmups = self.warmup_rounds
        if warmups < 0:
            raise ValueError("BenchmarkConfig.warmup_rounds must be non-negative.")

        iterations = self.pedantic_iterations
        if iterations <= 0:
            raise ValueError("BenchmarkConfig.pedantic_iterations must be positive.")
198
+
199
+
200
@dataclass(frozen=True, slots=True)
class BenchmarkCase:
    """One named set of inputs, plus metadata, for a pytest-benchmark matrix.

    Warning:
        With ``fresh_inputs`` false, pytest-benchmark may invoke the target
        repeatedly with the very same argument objects. Use that only when
        the target treats its inputs as immutable, or when reuse is the
        workload you intend to measure.

        With ``fresh_inputs`` true, the harness relies on pytest-benchmark
        pedantic setup, so building the inputs counts as setup rather than
        timed target-function work. Input creation is therefore never timed
        by accident, but the measurement is also not end-to-end. To include
        construction cost, construct the inputs inside the target function.

        With ``fresh_inputs`` true, ``BenchmarkConfig.pedantic_iterations``
        has no effect because pytest-benchmark setup mode is used. A runtime
        warning is emitted when a non-default value is ignored.

    Args:
        name: Human-readable case name used in parameter IDs and metadata.
        make_args: Factory producing the positional arguments for the target
            function.
        make_kwargs: Factory producing the keyword arguments for the target
            function.
        work_units: Positive logical amount of work done by one target call,
            or a callable returning it. It may stand for items, rows, bytes,
            tokens, records, events, or any other domain-specific unit.
        work_unit_name: Name of the logical work unit, for example
            ``"items"``, ``"rows"``, ``"bytes"``, or ``"tokens"``. Give a
            base unit name without spaces, slashes, or ``"/s"``; display code
            appends ``"/s"`` for throughput.
        fresh_inputs: Whether every benchmark round needs newly created
            inputs.
        metadata: Extra strict-JSON-renderable metadata describing the case.
            Reasonable scalar types such as paths, datetimes, enums, and
            NumPy scalars are coerced; unsupported values raise
            ``MetadataSerializationError``.

    Attributes:
        name: Human-readable case name used in parameter IDs and metadata.
        make_args: Factory producing the positional arguments for the target
            function.
        make_kwargs: Factory producing the keyword arguments for the target
            function.
        work_units: Positive logical amount of work done by one target call.
        work_unit_name: Name of the logical work unit.
        fresh_inputs: Whether every benchmark round needs newly created
            inputs.
        metadata: Strict JSON-safe metadata describing the case.
    """

    name: str
    make_args: Callable[[], tuple[object, ...]] = _empty_args
    make_kwargs: Callable[[], dict[str, object]] = _empty_kwargs
    work_units: float | Callable[[], float] | None = None
    work_unit_name: str = _DEFAULT_WORK_UNIT_NAME
    fresh_inputs: bool = False
    metadata: Mapping[str, object] = field(default_factory=_empty_metadata)

    def __post_init__(self) -> None:
        """Check the case fields and replace ``metadata`` with its coerced form."""
        if not self.name:
            raise ValueError("Benchmark case name must not be empty.")

        _validate_work_unit_name(self.work_unit_name)

        # A callable work_units is only checked when work_unit_count() calls it.
        declared_units = self.work_units
        if not (declared_units is None or callable(declared_units)):
            _ = _validate_work_units(declared_units)

        # The dataclass is frozen, so the coerced mapping is stored through
        # object.__setattr__.
        object.__setattr__(
            self,
            "metadata",
            _coerce_json_mapping(self.metadata, path="BenchmarkCase.metadata"),
        )

    def make_call(self) -> tuple[tuple[object, ...], dict[str, object]]:
        """Build the arguments for a single target invocation.

        Returns:
            A pair of positional arguments and keyword arguments.
        """
        positional = self.make_args()
        keyword = self.make_kwargs()
        return positional, keyword

    def work_unit_count(self) -> float | None:
        """Resolve the logical work-unit count used for throughput metrics.

        Returns:
            The validated work-unit count, or ``None`` when the case declares
            no work units.

        Raises:
            ValueError: If the work-unit count is not positive or finite.
        """
        declared_units = self.work_units
        if declared_units is None:
            return None

        if callable(declared_units):
            declared_units = declared_units()

        return _validate_work_units(declared_units)

    @classmethod
    def from_values(
        cls,
        name: str,
        *args: object,
        work_units: float | Callable[[], float] | None = None,
        work_unit_name: str = _DEFAULT_WORK_UNIT_NAME,
        fresh_inputs: bool = False,
        copier: Callable[[object], object] | None = None,
        metadata: Mapping[str, object] | None = None,
        **kwargs: object,
    ) -> BenchmarkCase:
        """Build a benchmark case directly from concrete argument values.

        Args:
            name: Case name.
            *args: Positional arguments for the target function.
            work_units: Positive logical amount of work done by one target
                call.
            work_unit_name: Name of the logical work unit, for example
                ``"items"``, ``"rows"``, ``"bytes"``, or ``"tokens"``. Give a
                base unit name without spaces, slashes, or ``"/s"``.
            fresh_inputs: Whether target invocations need fresh inputs. When
                true and ``copier`` is omitted, each argument value is
                shallow-copied.
            copier: Optional copy function applied to every argument value.
                Pass ``deep_copy`` or a domain-specific copy function when
                shallow copies are not fresh enough for the workload.
            metadata: Optional strict-JSON-renderable case metadata.
            **kwargs: Keyword arguments for the target function.

        Returns:
            A configured benchmark case.
        """
        effective_copier: Callable[[object], object] | None
        if fresh_inputs and copier is None:
            effective_copier = shallow_copy
        else:
            effective_copier = copier

        def make_args() -> tuple[object, ...]:
            """Return case positional arguments, copied when a copier applies."""
            if effective_copier is not None:
                return tuple(map(effective_copier, args))

            return args

        def make_kwargs() -> dict[str, object]:
            """Return case keyword arguments, copied when a copier applies."""
            if effective_copier is not None:
                return {key: effective_copier(value) for key, value in kwargs.items()}

            return dict(kwargs)

        # Supplying a copier implies fresh inputs even when the flag is unset.
        needs_fresh_inputs = fresh_inputs or effective_copier is not None
        case_metadata = {} if metadata is None else dict(metadata)

        return cls(
            name=name,
            make_args=make_args,
            make_kwargs=make_kwargs,
            work_units=work_units,
            work_unit_name=work_unit_name,
            fresh_inputs=needs_fresh_inputs,
            metadata=case_metadata,
        )
361
+
362
+
363
+ @dataclass(frozen=True, slots=True)
364
+ class BenchmarkInvocationRecord:
365
+ """Lightweight record returned after one benchmark invocation.
366
+
367
+ This record is not a timing result. Timing results come from
368
+ pytest-benchmark's report, saved runs, CSV output, or JSON output.
369
+
370
+ Attributes:
371
+ metric_name: Metric requested for this benchmark invocation.
372
+ implementation_name: Name of the implementation under test.
373
+ case_name: Name of the input case under test.
374
+ extra_info: Strict JSON-safe metadata attached to pytest-benchmark
375
+ output. Values are limited to JSON primitives, lists, and
376
+ string-keyed mappings after metadata coercion. The metadata includes
377
+ benchmatrix producer and schema-version markers.
378
+ """
379
+
380
+ metric_name: MetricName
381
+ implementation_name: str
382
+ case_name: str
383
+ extra_info: Mapping[str, object]
384
+
385
+
386
def benchmark_single_call_latency(
    benchmark: BenchmarkFixture,
    implementation_name: str,
    function: TargetFunction,
    case_name: str,
    case: BenchmarkCase,
    *,
    config: BenchmarkConfig | None = None,
    stream: TextIO | None = None,
) -> BenchmarkInvocationRecord:
    """Measure single-call latency of one implementation on one case.

    Args:
        benchmark: Pytest-benchmark fixture instance.
        implementation_name: Name of the implementation under test.
        function: Synchronous implementation to benchmark. It must finish
            the measured work before returning.
        case_name: Name of the input case under test.
        case: Benchmark input case.
        config: Harness configuration. ``BenchmarkConfig()`` is used when
            omitted.
        stream: Destination for progress output. Defaults to ``sys.stdout``
            when progress output is enabled.

    Returns:
        A lightweight invocation record holding the metadata attached to the
        benchmark. It is not a timing result.

    Raises:
        TypeError: If ``function`` is an async function.

    Warning:
        Only completed target-function work is measured. Input construction,
        lazy-result consumption, and other setup are left out unless they
        happen inside ``function``.
    """
    active_config = _resolve_config(config)

    # Metadata is attached before the run so it is part of the benchmark output.
    raw_info: dict[str, object] = _make_base_extra_info(
        METRIC_SINGLE_CALL_LATENCY,
        implementation_name,
        case_name,
        case,
    )
    attached_info = _set_extra_info(benchmark, raw_info)

    # The target's return value is not needed; pytest-benchmark keeps the timings.
    _ = _run_target(benchmark, function, case, config=active_config, force_pedantic=False)

    invocation = BenchmarkInvocationRecord(
        metric_name=METRIC_SINGLE_CALL_LATENCY,
        implementation_name=implementation_name,
        case_name=case_name,
        extra_info=attached_info,
    )
    _maybe_display_invocation_record(invocation, config=active_config, stream=stream)
    return invocation
441
+
442
+
443
def benchmark_batch_throughput(
    benchmark: BenchmarkFixture,
    implementation_name: str,
    function: TargetFunction,
    case_name: str,
    case: BenchmarkCase,
    *,
    config: BenchmarkConfig | None = None,
    stream: TextIO | None = None,
) -> BenchmarkInvocationRecord:
    """Measure batch throughput of one implementation on one case.

    Args:
        benchmark: Pytest-benchmark fixture instance.
        implementation_name: Name of the implementation under test.
        function: Synchronous implementation to benchmark. It must finish
            the measured work before returning.
        case_name: Name of the input case under test.
        case: Benchmark input case. When ``case.work_units`` is given,
            throughput is later derived as work units per second; otherwise
            it is derived as calls per second.
        config: Harness configuration. ``BenchmarkConfig()`` is used when
            omitted.
        stream: Destination for progress output. Defaults to ``sys.stdout``
            when progress output is enabled.

    Returns:
        A lightweight invocation record holding the metadata attached to the
        benchmark. It is not a timing result.

    Raises:
        TypeError: If ``function`` is an async function.
        ValueError: If ``case.work_units`` is not positive or finite.

    Warning:
        Throughput is derived from a single synchronous target invocation.
        Concurrency, saturation, queueing, and service request load are not
        modelled. ``case.work_units`` must accurately describe the work each
        target call completes.
    """
    active_config = _resolve_config(config)
    raw_info: dict[str, object] = _make_base_extra_info(
        METRIC_BATCH_THROUGHPUT,
        implementation_name,
        case_name,
        case,
    )

    # The throughput unit recorded depends on whether the case declares work units.
    units_per_call = case.work_unit_count()
    if units_per_call is not None:
        raw_info.update(
            {
                KEY_WORK_UNITS: units_per_call,
                KEY_WORK_UNIT_NAME: case.work_unit_name,
                KEY_THROUGHPUT_UNIT: THROUGHPUT_UNIT_WORK_UNITS_PER_SECOND,
            }
        )
    else:
        raw_info[KEY_THROUGHPUT_UNIT] = THROUGHPUT_UNIT_CALLS_PER_SECOND

    attached_info = _set_extra_info(benchmark, raw_info)
    _ = _run_target(benchmark, function, case, config=active_config, force_pedantic=False)

    invocation = BenchmarkInvocationRecord(
        metric_name=METRIC_BATCH_THROUGHPUT,
        implementation_name=implementation_name,
        case_name=case_name,
        extra_info=attached_info,
    )
    _maybe_display_invocation_record(invocation, config=active_config, stream=stream)
    return invocation
511
+
512
+
513
def benchmark_tail_latency(
    benchmark: BenchmarkFixture,
    implementation_name: str,
    function: TargetFunction,
    case_name: str,
    case: BenchmarkCase,
    *,
    config: BenchmarkConfig | None = None,
    stream: TextIO | None = None,
) -> BenchmarkInvocationRecord:
    """Measure the latency distribution of one implementation on one case.

    Args:
        benchmark: Pytest-benchmark fixture instance.
        implementation_name: Name of the implementation under test.
        function: Synchronous implementation to benchmark. It must finish
            the measured work before returning.
        case_name: Name of the input case under test.
        case: Benchmark input case.
        config: Harness configuration. ``BenchmarkConfig()`` is used when
            omitted.
        stream: Destination for progress output. Defaults to ``sys.stdout``
            when progress output is enabled.

    Returns:
        A lightweight invocation record holding the metadata attached to the
        benchmark. It is not a timing result.

    Raises:
        TypeError: If ``function`` is an async function.

    Warning:
        Pedantic mode is always used. Compute tail percentiles from the
        pytest-benchmark JSON ``data`` values. The metric compares
        implementations; it is not production p95/p99 latency under load.

        When ``case.fresh_inputs`` is false and ``config.pedantic_iterations``
        exceeds one, raw samples are per-round aggregate timings rather than
        clean one-call latency samples. The harness emits a runtime warning
        for that configuration.
    """
    active_config = _resolve_config(config)
    _warn_for_tail_latency_iteration_semantics(case, active_config)

    raw_info: dict[str, object] = _make_base_extra_info(
        METRIC_TAIL_LATENCY,
        implementation_name,
        case_name,
        case,
    )
    interpretation_note = (
        "Use pytest-benchmark JSON data to compute p50/p90/p95/p99. "
        "This is not production p95/p99 under load. If pedantic_iterations is "
        "greater than one, samples may represent per-round aggregate timings."
    )
    raw_info[KEY_TAIL_LATENCY_NOTE] = interpretation_note
    raw_info[KEY_TAIL_PERCENTILES] = list(TAIL_PERCENTILES)

    attached_info = _set_extra_info(benchmark, raw_info)
    # Pedantic mode is forced for this metric regardless of fresh_inputs.
    _ = _run_target(benchmark, function, case, config=active_config, force_pedantic=True)

    invocation = BenchmarkInvocationRecord(
        metric_name=METRIC_TAIL_LATENCY,
        implementation_name=implementation_name,
        case_name=case_name,
        extra_info=attached_info,
    )
    _maybe_display_invocation_record(invocation, config=active_config, stream=stream)
    return invocation
583
+
584
+
585
def run_benchmark_metric(
    benchmark: BenchmarkFixture,
    metric_name: MetricName,
    implementation_name: str,
    function: TargetFunction,
    case_name: str,
    case: BenchmarkCase,
    *,
    config: BenchmarkConfig | None = None,
    stream: TextIO | None = None,
) -> BenchmarkInvocationRecord:
    """Dispatch one metric run for one implementation and case.

    Args:
        benchmark: Pytest-benchmark fixture instance.
        metric_name: Metric to benchmark.
        implementation_name: Name of the implementation under test.
        function: Synchronous implementation to benchmark.
        case_name: Name of the input case under test.
        case: Benchmark input case.
        config: Harness configuration. ``BenchmarkConfig()`` is used when
            omitted.
        stream: Destination for progress output. Defaults to ``sys.stdout``
            when progress output is enabled.

    Returns:
        A lightweight invocation record holding the metadata attached to the
        benchmark. It is not a timing result.

    Raises:
        TypeError: If ``function`` is an async function.
        ValueError: If ``metric_name`` is unsupported.
    """
    active_config = _resolve_config(config)

    # Ordered (metric, helper) pairs; the first equal metric handles the call.
    runners = (
        (METRIC_SINGLE_CALL_LATENCY, benchmark_single_call_latency),
        (METRIC_BATCH_THROUGHPUT, benchmark_batch_throughput),
        (METRIC_TAIL_LATENCY, benchmark_tail_latency),
    )

    for supported_metric, runner in runners:
        if metric_name == supported_metric:
            return runner(
                benchmark,
                implementation_name,
                function,
                case_name,
                case,
                config=active_config,
                stream=stream,
            )

    raise ValueError(f"Unsupported benchmark metric: {metric_name!r}")
654
+
655
+
656
def make_benchmark_parameters(
    implementations: Mapping[str, TargetFunction],
    cases: Mapping[str, BenchmarkCase] | Iterable[BenchmarkCase],
    *,
    metrics: Iterable[MetricName] | None = None,
) -> list[_BenchmarkParameter]:
    """Build the pytest parameters for a metric x implementation x case matrix.

    Args:
        implementations: Mapping from implementation name to target function.
        cases: Mapping or iterable of benchmark input cases.
        metrics: Metrics to put in the matrix. All supported benchmatrix
            metrics are used when omitted.

    Returns:
        A list of values suitable for ``pytest.mark.parametrize``. Metrics
        vary slowest and cases vary fastest.
    """
    if metrics is None:
        selected_metrics = DEFAULT_METRICS
    else:
        # Materialize once; the caller may have passed a one-shot iterable.
        selected_metrics = tuple(metrics)

    named_cases = _case_items(cases)
    pytest_module = _load_pytest()

    return [
        pytest_module.param(
            metric_name,
            implementation_name,
            function,
            case_name,
            case,
            id=f"{metric_name}::{implementation_name}::{case_name}",
        )
        for metric_name in selected_metrics
        for implementation_name, function in implementations.items()
        for case_name, case in named_cases
    ]
693
+
694
+
695
def make_benchmark_test(
    implementations: Mapping[str, TargetFunction],
    cases: Mapping[str, BenchmarkCase] | Iterable[BenchmarkCase],
    *,
    metrics: Iterable[MetricName] | None = None,
    config: BenchmarkConfig | None = None,
) -> Callable[..., None]:
    """Build a parametrized pytest test covering a whole benchmark matrix.

    Bind the returned function to a module-level name that starts with
    ``test_`` so that pytest collects it.

    Args:
        implementations: Mapping from implementation name to target function.
        cases: Mapping or iterable of benchmark input cases.
        metrics: Metrics to put in the matrix. All supported benchmatrix
            metrics are used when omitted.
        config: Harness configuration. ``BenchmarkConfig()`` is used when
            omitted.

    Returns:
        A parametrized pytest test function ready for module-level assignment.
    """
    active_config = _resolve_config(config)
    matrix = make_benchmark_parameters(implementations, cases, metrics=metrics)

    def benchmark_test(
        benchmark: BenchmarkFixture,
        metric_name: MetricName,
        implementation_name: str,
        function: TargetFunction,
        case_name: str,
        case: BenchmarkCase,
    ) -> None:
        """Execute a single cell of the generated benchmark matrix."""
        _ = run_benchmark_metric(
            benchmark,
            metric_name,
            implementation_name,
            function,
            case_name,
            case,
            config=active_config,
        )

    # These names must match benchmark_test's parameters after ``benchmark``.
    argument_names = ("metric_name", "implementation_name", "function", "case_name", "case")
    decorate = _load_pytest().mark.parametrize(argument_names, matrix)
    return decorate(benchmark_test)
745
+
746
+
747
def shallow_copy(value: object) -> object:
    """Copy ``value`` one level deep with ``copy.copy``.

    Args:
        value: Object to duplicate.

    Returns:
        The shallow copy; nested objects are shared with ``value``.
    """
    duplicate = copy.copy(value)
    return duplicate
757
+
758
+
759
def deep_copy(value: object) -> object:
    """Copy ``value`` recursively with ``copy.deepcopy``.

    Args:
        value: Object to duplicate.

    Returns:
        The deep copy; nested objects are duplicated as well.
    """
    duplicate = copy.deepcopy(value)
    return duplicate
769
+
770
+
771
def _resolve_config(config: BenchmarkConfig | None) -> BenchmarkConfig:
    """Pass a supplied config through, or build a default one for ``None``."""
    if config is not None:
        return config

    return BenchmarkConfig()
774
+
775
+
776
+ def _load_pytest() -> _PytestModule:
777
+ """Import pytest at runtime."""
778
+ module = cast(object, importlib.import_module("pytest"))
779
+ return cast(_PytestModule, module)
780
+
781
+
782
def _run_target(
    benchmark: BenchmarkFixture,
    function: TargetFunction,
    case: BenchmarkCase,
    *,
    config: BenchmarkConfig,
    force_pedantic: bool,
) -> object:
    """Route one target function to the right pytest-benchmark entry point.

    Args:
        benchmark: Pytest-benchmark fixture instance.
        function: Target function to measure.
        case: Case that supplies the call arguments.
        config: Resolved harness configuration.
        force_pedantic: Whether pedantic mode is required even when the case
            reuses its inputs.

    Returns:
        Whatever the pytest-benchmark call returns.

    Raises:
        TypeError: If ``function`` is rejected by target validation.
    """
    _validate_target_function(function)

    if case.fresh_inputs:
        # Fresh inputs always use pedantic setup mode: the case's make_call is
        # passed as the setup callable and no iteration count is forwarded.
        _warn_if_pedantic_iterations_ignored(config)
        return benchmark.pedantic(
            function,
            setup=case.make_call,
            rounds=config.pedantic_rounds,
            warmup_rounds=config.warmup_rounds,
        )

    # Reused inputs: build the arguments once, up front.
    call_args, call_kwargs = case.make_call()

    if force_pedantic:
        return benchmark.pedantic(
            function,
            args=call_args,
            kwargs=call_kwargs,
            rounds=config.pedantic_rounds,
            warmup_rounds=config.warmup_rounds,
            iterations=config.pedantic_iterations,
        )

    return benchmark(function, *call_args, **call_kwargs)
817
+
818
+
819
+ def _validate_target_function(function: TargetFunction) -> None:
820
+ """Reject unsupported target-function shapes."""
821
+ if inspect.iscoroutinefunction(function) or inspect.iscoroutinefunction(type(function).__call__):
822
+ message = (
823
+ "benchmatrix supports only synchronous target functions; "
824
+ + "async functions would benchmark coroutine creation rather than execution."
825
+ )
826
+ raise TypeError(message)
827
+
828
+
829
def _warn_if_pedantic_iterations_ignored(config: BenchmarkConfig) -> None:
    """Emit a RuntimeWarning when a non-default pedantic_iterations is ignored.

    Called on the fresh-input path, where no iteration count is forwarded to
    pytest-benchmark.
    """
    uses_default_iterations = config.pedantic_iterations == _DEFAULT_PEDANTIC_ITERATIONS
    if not uses_default_iterations:
        # stacklevel=3 skips this helper and its direct caller.
        warnings.warn(
            "BenchmarkConfig.pedantic_iterations is ignored when BenchmarkCase.fresh_inputs is true because "
            "pytest-benchmark setup mode is used to keep input construction outside the timed function body.",
            RuntimeWarning,
            stacklevel=3,
        )
843
+
844
+
845
def _warn_for_tail_latency_iteration_semantics(
    case: BenchmarkCase,
    config: BenchmarkConfig,
) -> None:
    """Emit a RuntimeWarning when tail-latency samples are per-round aggregates.

    No warning is issued for fresh-input cases or for the default iteration
    count.
    """
    if case.fresh_inputs:
        return

    if config.pedantic_iterations == _DEFAULT_PEDANTIC_ITERATIONS:
        return

    # stacklevel=3 skips this helper and its direct caller.
    warnings.warn(
        "tail_latency with pedantic_iterations greater than one produces per-round aggregate timing samples, not clean "
        "one-call latency samples.",
        RuntimeWarning,
        stacklevel=3,
    )
862
+
863
+
864
+ def _validate_work_units(value: object) -> float:
865
+ """Validate and return a positive finite work-unit count."""
866
+ if not isinstance(value, str | bytes | bytearray | SupportsFloat | SupportsIndex):
867
+ raise ValueError("Benchmark work_units must be numeric.")
868
+
869
+ try:
870
+ numeric_value = float(value)
871
+ except (TypeError, ValueError) as exc:
872
+ raise ValueError("Benchmark work_units must be numeric.") from exc
873
+
874
+ if not math.isfinite(numeric_value):
875
+ raise ValueError("Benchmark work_units must be finite.")
876
+
877
+ if numeric_value <= 0.0:
878
+ raise ValueError("Benchmark work_units must be positive.")
879
+
880
+ return numeric_value
881
+
882
+
883
+ def _validate_work_unit_name(value: str) -> None:
884
+ """Validate a throughput work-unit name."""
885
+ if not value:
886
+ raise ValueError("Benchmark work_unit_name must not be empty.")
887
+
888
+ if not _WORK_UNIT_NAME_PATTERN.fullmatch(value):
889
+ message = (
890
+ "Benchmark work_unit_name must start with a letter and contain only letters, digits, underscores, or "
891
+ + "hyphens. Use base units such as 'items', 'rows', 'bytes', or 'tokens', not units like 'rows/s'."
892
+ )
893
+ raise ValueError(message)
894
+
895
+
896
def _make_base_extra_info(
    metric_name: MetricName,
    implementation_name: str,
    case_name: str,
    case: BenchmarkCase,
) -> dict[str, object]:
    """Assemble the raw metadata shared by every benchmark metric.

    The result starts with the producer, schema version, metric, implementation, case name, and
    the case's ``fresh_inputs`` flag. Each entry of ``case.metadata`` is then added under its key
    prefixed with ``case_``; a prefixed key equal to an existing key replaces that entry's value.
    Values are not validated or coerced here.
    """
    base_info: dict[str, object] = {
        KEY_PRODUCER: PRODUCER,
        KEY_SCHEMA_VERSION: SCHEMA_VERSION,
        KEY_METRIC_NAME: metric_name,
        KEY_IMPLEMENTATION_NAME: implementation_name,
        KEY_CASE_NAME: case_name,
        KEY_CASE_FRESH_INPUTS: case.fresh_inputs,
    }
    base_info.update((f"case_{name}", entry) for name, entry in case.metadata.items())
    return base_info
916
+
917
+
918
def _set_extra_info(
    benchmark: BenchmarkFixture,
    extra_info: Mapping[str, object],
) -> _ExtraInfo:
    """Coerce metadata to strict JSON, store it on the benchmark, and return it.

    Coercion runs first, so a serialization error leaves ``benchmark.extra_info`` untouched.
    On success the existing contents are cleared and replaced rather than merged.
    """
    validated = _coerce_json_mapping(extra_info, path="extra_info")

    benchmark.extra_info.clear()
    benchmark.extra_info.update(validated)

    return validated
927
+
928
+
929
+ def _maybe_display_invocation_record(
930
+ record: BenchmarkInvocationRecord,
931
+ *,
932
+ config: BenchmarkConfig,
933
+ stream: TextIO | None,
934
+ ) -> None:
935
+ """Display a progress record when streaming is enabled."""
936
+ if config.stream_progress:
937
+ _display_invocation_record(record, stream=stream)
938
+
939
+
940
+ def _display_invocation_record(
941
+ record: BenchmarkInvocationRecord,
942
+ stream: TextIO | None = None,
943
+ ) -> None:
944
+ """Print one lightweight benchmark invocation progress record."""
945
+ output = sys.stdout if stream is None else stream
946
+ message = (
947
+ f"[benchmark invoked] metric={record.metric_name} "
948
+ + f"implementation={record.implementation_name} case={record.case_name}; "
949
+ + "timing is available in pytest-benchmark output"
950
+ )
951
+ print(message, file=output, flush=True)
952
+
953
+
954
+ def _case_items(
955
+ cases: Mapping[str, BenchmarkCase] | Iterable[BenchmarkCase],
956
+ ) -> list[tuple[str, BenchmarkCase]]:
957
+ """Normalize case inputs into named case pairs."""
958
+ if isinstance(cases, Mapping):
959
+ mapping = cast(Mapping[str, BenchmarkCase], cases)
960
+ return list(mapping.items())
961
+
962
+ return [(case.name, case) for case in cases]
963
+
964
+
965
def _coerce_json_mapping(
    mapping: Mapping[str, object] | Mapping[object, object],
    *,
    path: str,
) -> _ExtraInfo:
    """Coerce every entry of a mapping into strict JSON-safe metadata.

    Entries are processed in iteration order: each key must be a ``str`` and each value is coerced
    with ``path`` extended by ``.<key>``. The finished dictionary is then checked as a whole for
    strict JSON serializability.

    Raises:
        MetadataSerializationError: If a key is not a string, or if value coercion or the final
            validation raises it.
    """
    coerced: _ExtraInfo = {}

    for name, raw_value in mapping.items():
        if isinstance(name, str):
            coerced[name] = _coerce_json_value(raw_value, path=f"{path}.{name}")
            continue

        raise MetadataSerializationError(f"Metadata key at {path} must be str, got {type(name).__name__}.")

    _validate_strict_json(coerced, path=path)
    return coerced
981
+
982
+
983
+ def _coerce_json_value(value: object, *, path: str) -> _JsonValue:
984
+ """Coerce a value to strict JSON or raise a serialization error."""
985
+ if value is None or isinstance(value, str | bool):
986
+ return value
987
+
988
+ if isinstance(value, int) and not isinstance(value, bool):
989
+ return value
990
+
991
+ if isinstance(value, float):
992
+ if math.isfinite(value):
993
+ return value
994
+
995
+ raise MetadataSerializationError(f"Metadata value at {path} must be finite, got {value!r}.")
996
+
997
+ if isinstance(value, PurePath):
998
+ return str(value)
999
+
1000
+ if isinstance(value, dt.datetime | dt.date | dt.time):
1001
+ return value.isoformat()
1002
+
1003
+ if isinstance(value, enum.Enum):
1004
+ enum_value = cast(object, value.value)
1005
+ return _coerce_json_value(enum_value, path=f"{path}.value")
1006
+
1007
+ if isinstance(value, list | tuple):
1008
+ sequence = cast(Sequence[object], value)
1009
+ return [_coerce_json_value(item, path=f"{path}[{index}]") for index, item in enumerate(sequence)]
1010
+
1011
+ if isinstance(value, Mapping):
1012
+ return _coerce_json_mapping(cast(Mapping[object, object], value), path=path)
1013
+
1014
+ numpy_scalar = _maybe_numpy_scalar_to_python(value, path=path)
1015
+ if numpy_scalar is not _NO_NUMPY_SCALAR:
1016
+ return _coerce_json_value(numpy_scalar, path=path)
1017
+
1018
+ raise MetadataSerializationError(
1019
+ f"Metadata value at {path} has unsupported type {type(value).__module__}.{type(value).__qualname__}."
1020
+ )
1021
+
1022
+
1023
def _maybe_numpy_scalar_to_python(value: object, *, path: str) -> object:
    """Unwrap a NumPy scalar into a plain Python scalar without importing NumPy.

    Detection is by type metadata only: the value's type must live in a module whose name starts
    with ``"numpy"``. Returns the ``_NO_NUMPY_SCALAR`` sentinel when the value is not from such a
    module, has no callable ``item`` attribute, or when ``item()`` returns the value itself.

    Raises:
        MetadataSerializationError: If the value's type is named ``ndarray``, or if calling
            ``item()`` raises.
    """
    kind = type(value)

    if not kind.__module__.startswith("numpy"):
        return _NO_NUMPY_SCALAR

    if kind.__name__ == "ndarray":
        raise MetadataSerializationError(
            f"Metadata value at {path} is a NumPy array, not a NumPy scalar. "
            + "Convert arrays to JSON-safe lists explicitly."
        )

    to_python = getattr(value, "item", None)
    if not callable(to_python):
        return _NO_NUMPY_SCALAR

    try:
        converted = to_python()
    except Exception as exc:
        # Deliberately broad: any failure inside item() is reported as a serialization error,
        # with the original exception chained as the cause.
        raise MetadataSerializationError(
            f"Metadata value at {path} looks NumPy-like but could not be converted to a Python scalar."
        ) from exc

    # item() handing back the same object means nothing was unwrapped; returning it would make
    # the caller re-coerce the same value indefinitely.
    return _NO_NUMPY_SCALAR if converted is value else converted
1053
+
1054
+
1055
+ def _validate_strict_json(value: _JsonValue, *, path: str) -> None:
1056
+ """Validate that a coerced value can be serialized as strict JSON."""
1057
+ try:
1058
+ _ = json.dumps(value, allow_nan=False)
1059
+ except (TypeError, ValueError) as exc:
1060
+ raise MetadataSerializationError(f"Metadata at {path} could not be serialized as strict JSON.") from exc