verily-0.1.0-py3-none-any.whl

src/verily/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .models import Benchmark, BenchmarkCase, RunConfigOverride
+
+__all__ = ["Benchmark", "BenchmarkCase", "RunConfigOverride"]
src/verily/display.py ADDED
@@ -0,0 +1,112 @@
+from rich.spinner import Spinner
+from rich.table import Table
+from rich.text import Text
+
+from .models import BenchmarkRunResult
+
+
+def generate_benchmark_table(
+    results: list[BenchmarkRunResult], confidence_level: float
+) -> Table:
+    table = Table(show_header=True, header_style="cyan", expand=True)
+    table.add_column("Benchmark", style="dim", width=30)
+    table.add_column(f"Mean @ {confidence_level:.0%} CI", justify="right")
+    table.add_column("Std Dev", justify="right")
+    table.add_column("Stability", justify="right")
+    table.add_column("Runs", justify="right")
+    table.add_column("Avg Dur~ (s)", justify="right")
+
+    if not results:
+        table.add_row("Waiting for benchmarks to start...", "", "", "", "", "")
+        return table
+
+    for result in results:
+        benchmark_name = result.benchmark.name
+        all_runs = [run for case in result.case_results for run in case.runs]
+
+        pending_runs = [r for r in all_runs if r.result.type == "pending"]
+        successful_runs = [r for r in all_runs if r.result.type == "success"]
+        failed_runs = [r for r in all_runs if r.result.type == "failure"]
+
+        latest_stats = result.stats
+
+        all_cases_done = all(cr.done for cr in result.case_results)
+
+        def with_spinner(text: str | None) -> str | Spinner | Text:
+            if pending_runs and not all_cases_done:
+                if text:
+                    return Spinner("dots", text=Text(text, style="white"), style="cyan")
+                return Spinner("dots", style="cyan")
+            return text or "[yellow]N/A[/yellow]"
+
+        mean_display_text = None
+        if latest_stats:
+            mean_val_text = f"{latest_stats.mean * 100:.2f}%"
+            if latest_stats.precision:
+                margin_of_error = (
+                    latest_stats.precision.high - latest_stats.precision.low
+                ) / 2
+                margin_of_error_text = f"±{margin_of_error * 100:.2f}%"
+                mean_display_text = f"{mean_val_text} ({margin_of_error_text})"
+            else:
+                mean_display_text = mean_val_text
+
+        std_dev_text = f"{latest_stats.std_dev:.3f}" if latest_stats else None
+        stability_text = f"{latest_stats.stability * 100:.2f}%" if latest_stats else None
+
+        mean_score_display = with_spinner(mean_display_text)
+        std_dev_display = with_spinner(std_dev_text)
+        stability_display = with_spinner(stability_text)
+
+        total_runs_count = len(all_runs)
+        runs_display_text = f"{total_runs_count}"
+        if failed_runs:
+            runs_display_text += f" [red]({len(failed_runs)} failed)[/red]"
+
+        avg_run_duration_display: str | Spinner | Text
+        avg_runtime_text = None
+        completed_runs = successful_runs + failed_runs
+        if completed_runs:
+            total_runtime = sum(r.result.runtime for r in completed_runs)
+            avg_runtime = total_runtime / len(completed_runs)
+            avg_runtime_text = f"{avg_runtime:.2f}"
+
+        avg_run_duration_display = with_spinner(avg_runtime_text)
+
+        table.add_row(
+            benchmark_name,
+            mean_score_display,
+            std_dev_display,
+            stability_display,
+            runs_display_text,
+            avg_run_duration_display,
+        )
+
+    return table
+
+
+def generate_failures_table(
+    results: list[BenchmarkRunResult]
+) -> Table:
+    table = Table(show_header=True, header_style="bold red")
+    table.title = "Failures"
+    table.add_column("Benchmark", style="dim", width=30)
+    table.add_column("Case", justify="right")
+    table.add_column("Error")
+    table.add_column("Stdout")
+    table.add_column("Stderr")
+
+    for result in results:
+        for case_result in result.case_results:
+            for run in case_result.runs:
+                if run.result.type == "failure":
+                    failure = run.result
+                    table.add_row(
+                        result.benchmark.name,
+                        str(case_result.case.inputs),
+                        failure.error_message,
+                        failure.stdout.decode("utf-8", errors="ignore"),
+                        failure.stderr.decode("utf-8", errors="ignore"),
+                    )
+
+    return table
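
For reference, the table builder can be exercised on its own with a rich Console; a minimal sketch (an empty results list renders the placeholder row):

from rich.console import Console

from verily.display import generate_benchmark_table

console = Console()
table = generate_benchmark_table(results=[], confidence_level=0.95)
console.print(table)
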
src/verily/io.py ADDED
@@ -0,0 +1,152 @@
+import json
+from collections.abc import Callable, Iterator
+from contextlib import contextmanager
+
+import fsspec
+
+from .models import (
+    BenchmarkCase,
+    BenchmarkRunResult,
+    BenchmarkStatistics,
+    CaseResult,
+    FailureResult,
+    NamedBenchmark,
+    Run,
+    RunConfig,
+    RunResult,
+    SuccessResult,
+)
+from .stats import aggregate_benchmark_statistics, calculate_run_stats
+from .utils import create_sequential_run_directory
+
+
+def _serialize_run(run: Run) -> dict:
+    result_dump = None
+    if hasattr(run.result, "model_dump"):
+        result_dump = run.result.model_dump()
+        if "stdout" in result_dump and isinstance(result_dump["stdout"], bytes):
+            result_dump["stdout"] = result_dump["stdout"].decode("utf-8", "ignore")
+        if "stderr" in result_dump and isinstance(result_dump["stderr"], bytes):
+            result_dump["stderr"] = result_dump["stderr"].decode("utf-8", "ignore")
+
+    return {
+        "result": result_dump,
+        "stats": run.stats.model_dump() if run.stats else None,
+    }
+
+
+@contextmanager
+def create_results_writer(
+    results_dir: str,
+    config: RunConfig,
+) -> Iterator[Callable[[BenchmarkRunResult], None]]:
+    fs, path = fsspec.url_to_fs(results_dir)
+    run_dir = create_sequential_run_directory(fs=fs, base_path=path)
+    written_runs = set()
+
+    with fs.open(f"{run_dir}/config.json", "w") as f:
+        json.dump(config.model_dump(), f, indent=4)
+
+    with fs.open(f"{run_dir}/results.jsonl", "w") as f:
+
+        def writer(result: BenchmarkRunResult) -> None:
+            benchmark_name = result.benchmark.name
+            for case_idx, case_result in enumerate(result.case_results):
+                for run_idx, run in enumerate(case_result.runs):
+                    run_id = (benchmark_name, case_idx, run_idx)
+                    if run.result.type != "pending" and run_id not in written_runs:
+                        run_data = _serialize_run(run)
+                        output_record = {
+                            "benchmark_name": benchmark_name,
+                            "case_index": case_idx,
+                            "case_inputs": case_result.case.inputs,
+                            "run_index": run_idx,
+                            **run_data,
+                        }
+                        f.write(json.dumps(output_record) + "\n")
+                        written_runs.add(run_id)
+
+        yield writer
+
+
+def load_results(run_path: str) -> tuple[list[BenchmarkRunResult], RunConfig]:
+    fs, path = fsspec.url_to_fs(run_path)
+
+    with fs.open(f"{path}/config.json", "r") as f:
+        config_dict = json.load(f)
+        config = RunConfig(**config_dict)
+
+    results_by_benchmark_case: dict[
+        tuple[str, int], list[Run]
+    ] = {}
+
+    with fs.open(f"{path}/results.jsonl", "r") as f:
+        for line in f:
+            record = json.loads(line)
+            benchmark_name = record["benchmark_name"]
+            case_index = record["case_index"]
+            case_inputs = record["case_inputs"]
+            run_index = record["run_index"]
+            run_data = record["result"]
+            stats_data = record["stats"]
+
+            run_result: RunResult
+            if run_data["type"] == "success":
+                run_result = SuccessResult(
+                    comparison=run_data["comparison"],
+                    stdout=run_data["stdout"].encode("utf-8"),
+                    stderr=run_data["stderr"].encode("utf-8"),
+                    runtime=run_data["runtime"],
+                )
+            elif run_data["type"] == "failure":
+                run_result = FailureResult(
+                    error_message=run_data["error_message"],
+                    stdout=run_data["stdout"].encode("utf-8"),
+                    stderr=run_data["stderr"].encode("utf-8"),
+                    runtime=run_data["runtime"],
+                )
+            else:
+                # This should not happen with current serialization logic
+                continue
+
+            run_stats = (
+                BenchmarkStatistics(**stats_data) if stats_data else None
+            )
+            run = Run(result=run_result, stats=run_stats)
+
+            if (benchmark_name, case_index) not in results_by_benchmark_case:
+                results_by_benchmark_case[(benchmark_name, case_index)] = []
+
+            # Ensure the list is long enough to insert at run_index
+            current_runs = results_by_benchmark_case[(benchmark_name, case_index)]
+            while len(current_runs) <= run_index:
+                current_runs.append(None)  # type: ignore
+            current_runs[run_index] = run
+
+
+    benchmark_results: dict[str, BenchmarkRunResult] = {}
+
+    for (benchmark_name, case_index), runs in results_by_benchmark_case.items():
+        # NOTE: We don't have the original Benchmark object, so we create a dummy one
+        # This is acceptable as it's only used for display purposes and not execution
+        dummy_benchmark_case = BenchmarkCase(inputs=[], expectation=None)
+
+        # Filter out None runs and calculate stats for the case
+        successful_runs_in_case = [r.result for r in runs if r and r.result.type == "success"]
+        case_stats = calculate_run_stats(successful_runs_in_case, config.confidence_level)
+
+        case_result = CaseResult(case=dummy_benchmark_case, runs=runs, stats=case_stats)
+
+        if benchmark_name not in benchmark_results:
+            dummy_named_benchmark = NamedBenchmark(name=benchmark_name, benchmark=None)  # type: ignore
+            benchmark_results[benchmark_name] = BenchmarkRunResult(
+                benchmark=dummy_named_benchmark, case_results=[], stats=None
+            )
+        benchmark_results[benchmark_name].case_results.append(case_result)
+
+    # Calculate overall benchmark statistics
+    for benchmark_run_result in benchmark_results.values():
+        all_case_stats = [cr.stats for cr in benchmark_run_result.case_results if cr.stats]
+        benchmark_run_result.stats = aggregate_benchmark_statistics(all_case_stats)
+
+    return list(benchmark_results.values()), config
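
A hedged sketch of reading a saved run back with load_results; the path "benchmark-results/run-0001" is hypothetical, standing in for a directory produced by create_sequential_run_directory:

from verily.io import load_results

results, config = load_results("benchmark-results/run-0001")
for result in results:
    if result.stats:
        print(result.benchmark.name, f"mean={result.stats.mean:.3f}")
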
src/verily/main.py ADDED
@@ -0,0 +1,198 @@
+import asyncio
+import os
+import pickle
+from collections.abc import AsyncIterator
+from typing import Annotated
+
+import typer
+from rich.console import Console, Group
+from rich.live import Live
+
+from .display import (
+    generate_benchmark_table,
+    generate_failures_table,
+)
+from .io import create_results_writer, load_results
+from .models import BenchmarkRunResult, BenchmarkSession, RunConfig
+from .runner import (
+    resolve_benchmark,
+    run_benchmark_case,
+    stream_benchmarks_runs,
+)
+
+app = typer.Typer()
+
+
+def _render_benchmark_output(
+    results: list[BenchmarkRunResult], config: RunConfig
+):
+    benchmark_table = generate_benchmark_table(
+        results, confidence_level=config.confidence_level
+    )
+
+    all_benchmarks_and_cases_done = all(
+        all(cr.done for cr in r.case_results) for r in results
+    )
+
+    if not all_benchmarks_and_cases_done or not any(
+        run.result.type == "failure"
+        for r in results
+        for case in r.case_results
+        for run in case.runs
+    ):
+        return benchmark_table
+
+    failures_table = generate_failures_table(results)
+    return Group(failures_table, benchmark_table)
+
+
+async def _run_benchmarks(
+    stream: AsyncIterator[BenchmarkSession],
+    results_dir: str,
+    live: Live,
+    config: RunConfig,
+):
+    session: BenchmarkSession | None = None
+
+    live.update(_render_benchmark_output(session.results if session else [], config))
+
+    with create_results_writer(results_dir, config) as write_result:
+        async for session in stream:
+            live.update(_render_benchmark_output(session.results, config))
+            for result in session.results:
+                write_result(result)
+            live.update(_render_benchmark_output(session.results, config))
+
+
+async def _run_worker_case_and_dump_result(
+    benchmark_path: str,
+    case_index: int,
+    result_fd: int,
+):
+    benchmark = resolve_benchmark(benchmark_path)
+    output_data = await run_benchmark_case(benchmark, case_index)
+    with os.fdopen(result_fd, "wb") as result_pipe_w:
+        pickle.dump(output_data, result_pipe_w)
+
+
+def _show_results(results: list[BenchmarkRunResult], config: RunConfig, console: Console):
+    output = _render_benchmark_output(results, config)
+    console.print(output)
+
+
+@app.command("run")
+def run_command(
+    benchmark_paths: Annotated[
+        list[str],
+        typer.Option(
+            "--benchmark-path",
+            "-b",
+            help="Path to the benchmark to run (module:instance or file path)",
+        ),
+    ],
+    min_runs: Annotated[
+        int,
+        typer.Option(
+            "--min-runs",
+            help="Minimum number of times to repeat each benchmark case.",
+        ),
+    ],
+    max_runs: Annotated[
+        int,
+        typer.Option(
+            "--max-runs",
+            "-r",
+            help="Maximum number of times to repeat each benchmark case.",
+        ),
+    ],
+    batch_size: Annotated[
+        int,
+        typer.Option(
+            "--batch-size",
+            help="The number of runs to execute in a batch.",
+        ),
+    ],
+    stability_goal: Annotated[
+        float,
+        typer.Option(
+            "--stability-goal",
+            help="The stability goal for the benchmark runs.",
+        ),
+    ],
+    confidence_level: Annotated[
+        float,
+        typer.Option(
+            "--confidence-level",
+            help="Confidence level for the precision estimate.",
+        ),
+    ],
+    results_dir: Annotated[
+        str,
+        typer.Option(
+            "--results-dir",
+            "-o",
+            help="Directory to save benchmark results",
+        ),
+    ],
+):
+    loop = asyncio.get_event_loop()
+    run_config = RunConfig(
+        min_runs=min_runs,
+        max_runs=max_runs,
+        batch_size=batch_size,
+        stability_goal=stability_goal,
+        confidence_level=confidence_level,
+    )
+    stream = stream_benchmarks_runs(
+        benchmark_paths,
+        run_config=run_config,
+    )
+    with Live(refresh_per_second=10) as live:
+        loop.run_until_complete(
+            _run_benchmarks(stream, results_dir, live, run_config)
+        )
+
+
+@app.command("worker")
+def worker_command(
+    benchmark_path: Annotated[
+        str,
+        typer.Option(
+            "--benchmark-path",
+            help="Path to the benchmark (module:instance).",
+        ),
+    ],
+    case_index: Annotated[
+        int,
+        typer.Option(
+            "--case-index",
+            help="Index of the case to run.",
+        ),
+    ],
+    result_fd: Annotated[
+        int,
+        typer.Option(
+            "--result-fd",
+            help="File descriptor to write pickled result to.",
+        ),
+    ],
+):
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(
+        _run_worker_case_and_dump_result(benchmark_path, case_index, result_fd)
+    )
+
+
+@app.command("show")
+def show_command(
+    run_path: Annotated[
+        str,
+        typer.Argument(
+            help="Path to the directory containing benchmark results (e.g., a 'run-XXXX' directory)."
+        ),
+    ],
+):
+    console = Console()
+    results, run_config = load_results(run_path)
+    _show_results(results, run_config, console)
+
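
A sketch of driving the "run" command in-process with typer's test runner; the benchmark path "examples.my_benchmarks:bench" and the option values are illustrative only:

from typer.testing import CliRunner

from verily.main import app

runner = CliRunner()
result = runner.invoke(
    app,
    [
        "run",
        "--benchmark-path", "examples.my_benchmarks:bench",
        "--min-runs", "3",
        "--max-runs", "10",
        "--batch-size", "2",
        "--stability-goal", "0.95",
        "--confidence-level", "0.95",
        "--results-dir", "benchmark-results",
    ],
)
print(result.exit_code)
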
src/verily/models.py ADDED
@@ -0,0 +1,141 @@
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass
+from typing import Any, Literal, Protocol
+
+from pydantic import BaseModel, ConfigDict
+
+
+class Comparisonlike(Protocol):
+    similarity: float
+    actual: Any
+    expected: Any
+
+
+class Comparer(Protocol):
+    async def compare(self, item1: Any, item2: Any) -> Comparisonlike: ...
+
+
+class RunConfigOverride(BaseModel):
+    min_runs: int | None = None
+    max_runs: int | None = None
+    batch_size: int | None = None
+    stability_goal: float | None = None
+
+
+class BenchmarkCase(BaseModel):
+    inputs: list[Any]
+    expectation: Any
+    config: RunConfigOverride | None = None
+
+    def __hash__(self):
+        return id(self)
+
+    def __eq__(self, other):
+        return self is other
+
+
+@dataclass
+class Benchmark:
+    runner: Callable[..., Awaitable[Any]]
+    comparer: Comparer
+    cases: list[BenchmarkCase]
+    config: RunConfigOverride | None = None
+
+
+class Comparison(BaseModel):
+    model_config = ConfigDict(from_attributes=True)
+    similarity: float
+    actual: Any
+    expected: Any
+
+
+class SuccessfulEvaluation(BaseModel):
+    type: Literal["success"] = "success"
+    comparison: Comparison
+
+
+class FailingEvaluation(BaseModel):
+    type: Literal["failure"] = "failure"
+    error_message: str
+
+
+EvaluationResult = SuccessfulEvaluation | FailingEvaluation
+
+
+class SuccessResult(BaseModel):
+    type: Literal["success"] = "success"
+    comparison: Comparison
+    stdout: bytes
+    stderr: bytes
+    runtime: float
+
+
+class FailureResult(BaseModel):
+    type: Literal["failure"] = "failure"
+    error_message: str
+    stdout: bytes
+    stderr: bytes
+    runtime: float
+
+
+class PendingResult(BaseModel):
+    type: Literal["pending"] = "pending"
+
+
+RunResult = SuccessResult | FailureResult
+RunState = RunResult | PendingResult
+
+
+@dataclass
+class Run:
+    result: RunState
+    stats: "BenchmarkStatistics | None"
+
+
+@dataclass
+class CaseResult:
+    case: BenchmarkCase
+    runs: list[Run | None]
+    stats: "BenchmarkStatistics | None" = None
+    done: bool = False
+
+
+@dataclass(eq=False)
+class NamedBenchmark:
+    name: str
+    benchmark: Benchmark
+
+
+@dataclass
+class BenchmarkRunResult:
+    benchmark: NamedBenchmark
+    case_results: list[CaseResult]
+    stats: "BenchmarkStatistics | None" = None
+
+
+@dataclass
+class BenchmarkSession:
+    results: list[BenchmarkRunResult]
+
+
+class Interval(BaseModel):
+    low: float
+    high: float
+
+
+class BenchmarkStatistics(BaseModel):
+    mean: float
+    std_dev: float
+    precision: Interval | None
+    stability: float
+
+
+RunCallable = Callable[[], Awaitable[RunResult]]
+
+
+class RunConfig(BaseModel):
+    min_runs: int
+    max_runs: int
+    batch_size: int
+    stability_goal: float
+    confidence_level: float
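
A hedged sketch of what a user-defined benchmark module might look like with these models; the names echo_runner, ExactComparer, and bench are illustrative, not part of the package:

from typing import Any

from verily import Benchmark, BenchmarkCase
from verily.models import Comparison


class ExactComparer:
    # Satisfies the Comparer protocol: compare(expectation, actual) -> Comparisonlike.
    async def compare(self, expected: Any, actual: Any) -> Comparison:
        return Comparison(
            similarity=1.0 if expected == actual else 0.0,
            actual=actual,
            expected=expected,
        )


async def echo_runner(text: str) -> str:
    # A deterministic stand-in for the stochastic system under test.
    return text


bench = Benchmark(
    runner=echo_runner,
    comparer=ExactComparer(),
    cases=[BenchmarkCase(inputs=["hello"], expectation="hello")],
)
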
src/verily/runner.py ADDED
@@ -0,0 +1,305 @@
+import inspect
+import os
+import pickle
+import time
+import traceback
+from asyncio import create_subprocess_exec, subprocess
+from collections.abc import AsyncIterator, Iterable
+from importlib import import_module
+
+from aiostream import stream
+
+from .models import (
+    Benchmark,
+    BenchmarkCase,
+    BenchmarkRunResult,
+    BenchmarkSession,
+    BenchmarkStatistics,
+    CaseResult,
+    Comparison,
+    EvaluationResult,
+    FailingEvaluation,
+    FailureResult,
+    NamedBenchmark,
+    PendingResult,
+    Run,
+    RunConfig,
+    RunConfigOverride,
+    RunResult,
+    SuccessfulEvaluation,
+    SuccessResult,
+)
+from .stats import aggregate_benchmark_statistics, calculate_run_stats
+from .utils import _pipe
+
+
+class WorkerRuntimeError(RuntimeError):
+    def __init__(self, message: str, returncode: int, stdout: bytes, stderr: bytes):
+        super().__init__(message)
+        self.returncode = returncode
+        self.stdout = stdout
+        self.stderr = stderr
+
+
+# TODO: move
+def resolve_benchmark(benchmark_path: str) -> Benchmark:
+    module_name, instance_name = benchmark_path.split(":")
+    module = import_module(module_name)
+    benchmark_instance = getattr(module, instance_name)
+    if not isinstance(benchmark_instance, Benchmark):
+        raise TypeError(
+            f"Expected '{instance_name}' in '{module_name}' to be an instance of Benchmark, "
+            f"got {type(benchmark_instance).__name__} instead."
+        )
+    return benchmark_instance
+
+
+def resolve_benchmarks(benchmark_paths: list[str]) -> Iterable[NamedBenchmark]:
+    for path in benchmark_paths:
+        if ":" in path:
+            yield NamedBenchmark(name=path, benchmark=resolve_benchmark(path))
+        else:
+            module_name = path.replace("/", ".").removesuffix(".py")
+            module = import_module(module_name)
+            for name, member in inspect.getmembers(module):
+                if isinstance(member, Benchmark):
+                    yield NamedBenchmark(
+                        name=f"{module_name}:{name}", benchmark=member
+                    )
+
+
+def _get_run_result(read_fd: int) -> EvaluationResult:
+    with os.fdopen(read_fd, "rb") as result_pipe_r:
+        return pickle.load(result_pipe_r)
+
+
+def _get_test_name_from_path(benchmark_path: str) -> str:
+    try:
+        test_name = benchmark_path.split(":")[1]
+        return test_name.replace("_", " ").title()
+    except IndexError:
+        return benchmark_path
+
+
+def resolve_run_config(
+    global_config: RunConfig,
+    benchmark_config: RunConfigOverride | None,
+    case_config: RunConfigOverride | None,
+) -> RunConfig:
+    resolved_config = global_config.model_copy()
+
+    if benchmark_config:
+        resolved_config.min_runs = benchmark_config.min_runs or resolved_config.min_runs
+        resolved_config.max_runs = benchmark_config.max_runs or resolved_config.max_runs
+        resolved_config.batch_size = benchmark_config.batch_size or resolved_config.batch_size
+        resolved_config.stability_goal = benchmark_config.stability_goal or resolved_config.stability_goal
+
+    if case_config:
+        resolved_config.min_runs = case_config.min_runs or resolved_config.min_runs
+        resolved_config.max_runs = case_config.max_runs or resolved_config.max_runs
+        resolved_config.batch_size = case_config.batch_size or resolved_config.batch_size
+        resolved_config.stability_goal = case_config.stability_goal or resolved_config.stability_goal
+
+    return resolved_config
+
+
+async def run_benchmark_case(
+    benchmark: Benchmark,
+    case_idx: int,
+) -> EvaluationResult:
+    case = benchmark.cases[case_idx]
+
+    try:
+        result = await benchmark.runner(*case.inputs)
+        comparison = Comparison.model_validate(
+            await benchmark.comparer.compare(case.expectation, result)
+        )
+        return SuccessfulEvaluation(comparison=comparison)
+    except Exception:
+        return FailingEvaluation(error_message=traceback.format_exc())
+
+
+async def run_single_case(
+    benchmark_path_str: str,
+    case_idx: int,
+) -> RunResult:
+    start_time = time.monotonic()
+
+    with _pipe() as (read_fd, write_fd):
+        args = [
+            "stabe",
+            "worker",
+            "--benchmark-path",
+            benchmark_path_str,
+            "--case-index",
+            str(case_idx),
+            "--result-fd",
+            str(write_fd),
+        ]
+
+        proc = await create_subprocess_exec(
+            *args,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            pass_fds=(write_fd,),
+        )
+
+        stdout_bytes, stderr_bytes = await proc.communicate()
+        runtime = time.monotonic() - start_time
+
+        if proc.returncode != 0:
+            raise WorkerRuntimeError(
+                message=f"Worker process exited with code {proc.returncode}.",
+                returncode=proc.returncode,
+                stdout=stdout_bytes,
+                stderr=stderr_bytes,
+            )
+
+        try:
+            worker_output = _get_run_result(read_fd)
+            if not worker_output:
+                raise WorkerRuntimeError(
+                    message="Worker exited successfully but provided no data via pipe.",
+                    returncode=0,
+                    stdout=stdout_bytes,
+                    stderr=stderr_bytes,
+                )
+            if worker_output.type == "success":
+                return SuccessResult(
+                    comparison=worker_output.comparison,
+                    stdout=stdout_bytes,
+                    stderr=stderr_bytes,
+                    runtime=runtime,
+                )
+            return FailureResult(
+                error_message=worker_output.error_message,
+                stdout=stdout_bytes,
+                stderr=stderr_bytes,
+                runtime=runtime,
+            )
+        except (EOFError, pickle.UnpicklingError) as e:
+            raise WorkerRuntimeError(
+                message=f"Worker exited successfully but failed to provide valid result via pipe: {type(e).__name__} - {e}",
+                returncode=0,
+                stdout=stdout_bytes,
+                stderr=stderr_bytes,
+            )
+
+
+async def stream_case_runs(
+    benchmark_path: str,
+    case: BenchmarkCase,
+    case_idx: int,
+    global_run_config: RunConfig,
+    benchmark_run_config_override: RunConfigOverride | None,
+) -> AsyncIterator[CaseResult]:
+
+    run_config = resolve_run_config(
+        global_config=global_run_config,
+        benchmark_config=benchmark_run_config_override,
+        case_config=case.config,
+    )
+
+    runs: list[Run] = []
+    latest_stats: BenchmarkStatistics | None = None
+    run_count = 0
+
+    while run_count < run_config.max_runs:
+        # if we reach min runs and our latest stats
+        # meet the stability goal then we can stop
+        if (
+            run_count >= run_config.min_runs and
+            latest_stats and
+            latest_stats.stability >= run_config.stability_goal
+        ):
+            break
+
+        # Determine batch size for this iteration
+        batch_size = min(run_config.batch_size, run_config.max_runs - run_count)
+        batch_indices = list(range(run_count, run_count + batch_size))
+
+        run_count += batch_size
+
+        # start with pending results for the new batch
+        runs.extend([Run(result=PendingResult(), stats=None) for _ in batch_indices])
+        yield CaseResult(case=case, runs=list(runs), stats=latest_stats)
+
+        # note that while this function ultimately only yields once
+        # and could just be normal async, we leverage this iterator
+        # aspect to ensure we can provide continuous updates via the
+        # below stream merge
+        async def run_and_update(idx: int) -> AsyncIterator[CaseResult]:
+            nonlocal latest_stats
+            run_result = await run_single_case(benchmark_path, case_idx)
+            runs[idx] = Run(result=run_result, stats=None)
+            successful_runs = [
+                r.result for r in runs if r and r.result.type == "success"
+            ]
+            latest_stats = calculate_run_stats(
+                successful_runs, run_config.confidence_level
+            )
+            yield CaseResult(case=case, runs=list(runs), stats=latest_stats)
+
+        batch_runners = [run_and_update(idx) for idx in batch_indices]
+        merged_stream = stream.merge(*batch_runners)
+        async with merged_stream.stream() as streamer:
+            async for case_result in streamer:
+                yield case_result
+
+    yield CaseResult(case=case, runs=list(runs), stats=latest_stats, done=True)
+
+
+async def stream_benchmark_runs(
+    benchmark: NamedBenchmark,
+    run_config: RunConfig,
+) -> AsyncIterator[BenchmarkRunResult]:
+    case_results: dict[BenchmarkCase, CaseResult] = {
+        case: CaseResult(case=case, runs=[]) for case in benchmark.benchmark.cases
+    }
+
+    def get_stats() -> BenchmarkStatistics | None:
+        all_stats = [cr.stats for cr in case_results.values() if cr.stats]
+        return aggregate_benchmark_statistics(all_stats)
+
+    all_case_runners = [
+        stream_case_runs(
+            benchmark_path=benchmark.name,
+            case=case,
+            case_idx=benchmark.benchmark.cases.index(case),
+            global_run_config=run_config,
+            benchmark_run_config_override=benchmark.benchmark.config,
+        )
+        for case in benchmark.benchmark.cases
+    ]
+
+    merged_stream = stream.merge(*all_case_runners)
+    async with merged_stream.stream() as streamer:
+        async for case_result in streamer:
+            case_results[case_result.case] = case_result
+            yield BenchmarkRunResult(
+                benchmark=benchmark,
+                case_results=list(case_results.values()),
+                stats=get_stats(),
+            )
+
+
+# TODO: move or receive the result of resolve_benchmarks instead
+async def stream_benchmarks_runs(
+    benchmark_paths: list[str],
+    run_config: RunConfig,
+) -> AsyncIterator[BenchmarkSession]:
+    benchmarks = list(resolve_benchmarks(benchmark_paths))
+    benchmark_streams = [
+        stream_benchmark_runs(
+            named_benchmark,
+            run_config=run_config,
+        )
+        for named_benchmark in benchmarks
+    ]
+
+    benchmark_states: dict[NamedBenchmark, BenchmarkRunResult] = {}
+    merged_stream = stream.merge(*benchmark_streams)
+    async with merged_stream.stream() as streamer:
+        async for item in streamer:
+            benchmark_states[item.benchmark] = item
+            yield BenchmarkSession(results=list(benchmark_states.values()))
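
A sketch of consuming the run stream without the CLI; the benchmark module path "examples.my_benchmarks" is illustrative, and this assumes the worker command that run_single_case shells out to is available on PATH:

import asyncio

from verily.models import RunConfig
from verily.runner import stream_benchmarks_runs


async def main() -> None:
    config = RunConfig(
        min_runs=3, max_runs=10, batch_size=2,
        stability_goal=0.95, confidence_level=0.95,
    )
    # Each yielded session is a cumulative snapshot of all benchmarks so far.
    async for session in stream_benchmarks_runs(["examples.my_benchmarks"], run_config=config):
        for result in session.results:
            done = all(cr.done for cr in result.case_results)
            print(result.benchmark.name, "done" if done else "running")


asyncio.run(main())
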
src/verily/stats.py ADDED
@@ -0,0 +1,112 @@
+import numpy as np
+from scipy.stats import t
+
+from .models import BenchmarkStatistics, Interval, SuccessResult
+
+
+def calculate_stability(scores: list[float]) -> float:
+    """
+    calculates a stability score between 0 and 1 for a sequence of scores.
+    a score of 1.0 indicates perfect stability. this is based on a normalized
+    split r-hat diagnostic.
+    """
+    if len(scores) < 4:
+        r_hat = 2.0  # NOTE: not enough data to compute, assign high r-hat
+    else:
+        n = len(scores)
+        half_len = n // 2
+
+        # NOTE: split the sequence into two halves
+        chain1 = scores[:half_len]
+        chain2 = scores[half_len : half_len * 2]  # NOTE: ensure equal length
+
+        # 1. calculate within-chain variance (w)
+        var1 = np.var(chain1, ddof=1)
+        var2 = np.var(chain2, ddof=1)
+        w = 0.5 * (var1 + var2)
+
+        if w == 0:  # NOTE: avoid division by zero if variance is null
+            r_hat = 1.0
+        else:
+            # 2. calculate between-chain variance (b)
+            mean1 = np.mean(chain1)
+            mean2 = np.mean(chain2)
+            mean_total = np.mean(scores[: half_len * 2])
+            b = half_len * ((mean1 - mean_total) ** 2 + (mean2 - mean_total) ** 2)
+
+            # 3. estimate the marginal posterior variance (var_hat)
+            var_hat = ((half_len - 1) / half_len) * w + (1 / half_len) * b
+            r_hat = np.sqrt(var_hat / w)
+
+    # NOTE: ensure r_hat is at least 1.0 to prevent stability > 100%
+    return 1.0 / max(1.0, r_hat)
+
+def aggregate_benchmark_statistics(
+    stats_list: list[BenchmarkStatistics],
+) -> BenchmarkStatistics | None:
+    """
+    aggregates a list of benchmark statistics into a single representative
+    statistic.
+    """
+    if not stats_list:
+        return None
+
+    mean_of_means = float(np.mean([s.mean for s in stats_list]))
+    mean_of_std_devs = float(np.mean([s.std_dev for s in stats_list]))
+    mean_of_stabilities = float(np.mean([s.stability for s in stats_list]))
+
+    # NOTE: aggregate precision by taking the pessimistic (widest) interval
+    precision_intervals = [s.precision for s in stats_list if s.precision]
+    if precision_intervals:
+        min_low = float(np.min([p.low for p in precision_intervals]))
+        max_high = float(np.max([p.high for p in precision_intervals]))
+        aggregated_precision = Interval(low=min_low, high=max_high)
+    else:
+        aggregated_precision = None
+
+    return BenchmarkStatistics(
+        mean=mean_of_means,
+        std_dev=mean_of_std_devs,
+        precision=aggregated_precision,
+        stability=mean_of_stabilities,
+    )
+
+def calculate_run_stats(
+    runs: list[SuccessResult], confidence_level: float
+) -> BenchmarkStatistics | None:
+    """
+    calculates statistics for a series of successful runs.
+    """
+    if not runs:
+        return None
+
+    similarities = [run.comparison.similarity for run in runs]
+
+    mean_similarity = float(np.mean(similarities))
+    std_dev_similarity = (
+        float(np.std(similarities, ddof=1)) if len(similarities) > 1 else 0.0
+    )
+
+    precision = None
+    if len(similarities) > 1:
+        # NOTE: calculate confidence interval for the mean
+        with np.errstate(invalid="ignore"):  # NOTE: avoid warning when sem is 0
+            sem = std_dev_similarity / np.sqrt(len(similarities))
+            if sem > 0 and np.isfinite(sem):
+                t_crit = t.ppf((1 + confidence_level) / 2, len(similarities) - 1)
+                margin_of_error = t_crit * sem
+                precision = Interval(
+                    low=mean_similarity - margin_of_error,
+                    high=mean_similarity + margin_of_error,
+                )
+            elif sem == 0:
+                precision = Interval(low=mean_similarity, high=mean_similarity)
+
+    stability = calculate_stability(similarities)
+
+    return BenchmarkStatistics(
+        mean=mean_similarity,
+        std_dev=std_dev_similarity,
+        precision=precision,
+        stability=stability,
+    )
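
A quick illustration of the split R-hat based stability score on synthetic similarity scores; the values below are made up for the example:

from verily.stats import calculate_stability

steady = [0.90, 0.91, 0.90, 0.92, 0.91, 0.90]
drifting = [0.50, 0.55, 0.70, 0.85, 0.90, 0.95]

print(calculate_stability(steady))    # close to 1.0
print(calculate_stability(drifting))  # noticeably below 1.0
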
src/verily/utils.py ADDED
@@ -0,0 +1,71 @@
+import contextlib
+import errno
+
+from fsspec import AbstractFileSystem
+
+
+def create_sequential_run_directory(
+    fs: AbstractFileSystem, base_path: str, prefix: str = "run-"
+) -> str:
+    """
+    Creates a sequentially numbered, race-condition-safe run directory.
+    e.g., ./benchmarks/run-0001, ./benchmarks/run-0002, etc.
+
+    Args:
+        fs: The fsspec filesystem instance to use.
+        base_path: The base directory where run directories will be created.
+        prefix: The prefix for the run directory names (e.g., "run-").
+
+    Returns:
+        The path to the created directory.
+
+    Raises:
+        OSError: If directory creation fails after multiple retries, or for
+            any other unexpected OS error.
+    """
+    # find the last successful index to start the search from.
+    pattern = f"{base_path}{fs.sep}{prefix}*"
+    existing_dirs = fs.glob(pattern)
+    max_index = 0
+    for d in existing_dirs:
+        try:
+            index_str = d.split(fs.sep)[-1].replace(prefix, "")
+            max_index = max(max_index, int(index_str))
+        except (ValueError, IndexError):
+            continue
+
+    next_index = max_index + 1
+
+    # zero-pad the index to 4 digits (note: the 4-digit width is arbitrary)
+    dir_name = f"{prefix}{next_index:04d}"
+    full_path = f"{base_path}{fs.sep}{dir_name}"
+
+    fs.makedirs(full_path, exist_ok=False)
+    return full_path
+
+
+def _try_close_fd(fd: int):
+    try:
+        # NOTE: os.close is still used here as fsspec does not provide a direct
+        # equivalent for low-level file descriptor operations.
+        # This function is related to _pipe, which is a low-level OS pipe.
+        import os
+
+        os.close(fd)
+    except OSError as e:
+        if e.errno != errno.EBADF:
+            raise
+
+
+@contextlib.contextmanager
+def _pipe():
+    # NOTE: os.pipe is still used here as fsspec does not provide a direct
+    # equivalent for low-level file descriptor operations.
+    import os
+
+    read_fd, write_fd = os.pipe()
+    try:
+        yield read_fd, write_fd
+    finally:
+        _try_close_fd(read_fd)
+        _try_close_fd(write_fd)
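
A small sketch of the directory helper against the local fsspec filesystem; "benchmark-results" is an arbitrary base directory chosen for the example:

import fsspec

from verily.utils import create_sequential_run_directory

fs = fsspec.filesystem("file")
fs.makedirs("benchmark-results", exist_ok=True)
run_dir = create_sequential_run_directory(fs=fs, base_path="benchmark-results")
print(run_dir)  # e.g. "benchmark-results/run-0001"
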
verily-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,17 @@
+Metadata-Version: 2.4
+Name: verily
+Version: 0.1.0
+Summary: Simple stability testing test for stochastic systems
+License-File: LICENSE
+Requires-Python: >=3.11
+Requires-Dist: aiostream>=0.6.4
+Requires-Dist: fsspec>=2025.5.1
+Requires-Dist: numpy>=2.3.1
+Requires-Dist: pydantic>=2.11.7
+Requires-Dist: rich>=14.0.0
+Requires-Dist: scipy>=1.16.0
+Requires-Dist: typer>=0.16.0
+Description-Content-Type: text/markdown
+
+# Verily
+Simple stability testing test for stochastic systems
verily-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+src/verily/__init__.py,sha256=ghf3ojwgzu-J1SpUvj7JF_3Dsg2VVNnhiS4MNBOVPnQ,127
+src/verily/display.py,sha256=DlgY3skbZ8DS27w1atj-FQlpm7-xC0UA6nbkyoF-2EQ,4243
+src/verily/io.py,sha256=TzIUlsZBFZrmIREdPH_oa-S3ww3eDB9PfRH5D4xbC3Y,6049
+src/verily/main.py,sha256=JKUHYGOzzJHUT2V_RsE3M1CWzsi_VSzC_s-7Kmwff5g,5140
+src/verily/models.py,sha256=YSvbsBOOQz20vRkrUCDGX8ls0wmlVdTD6viTWACsS8k,2739
+src/verily/runner.py,sha256=Me7HGBOVpqi2NGG_CP9h6-v_1aencMpJOWu_FRZXDTQ,10514
+src/verily/stats.py,sha256=raHuMaGNcZQk5Hq4UrnCtaZEDpokp6wTxGhEmdx3TWU,3983
+src/verily/utils.py,sha256=ZwtfGn1WkXilF_HT7YUOM2AAOyVQRAn0nBgRRbFDwYs,2102
+verily-0.1.0.dist-info/METADATA,sha256=UaiHKaZElJVNdaHx4eNI-j9MGXKVuALXIfkeXaaTv6s,471
+verily-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+verily-0.1.0.dist-info/entry_points.txt,sha256=oOhhFgdZ0iaagLEahPtUvsFoAGaidzlCWeDGP0uN5Is,40
+verily-0.1.0.dist-info/licenses/LICENSE,sha256=l6AVXL-la_CflPm9SetkDxEiHBNfIetKdFGiWyCq8-k,1063
+verily-0.1.0.dist-info/RECORD,,
verily-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.27.0
+Root-Is-Purelib: true
+Tag: py3-none-any
verily-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+[console_scripts]
+zen = verily.main:app
verily-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 wowthx
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.