verily-0.1.0-py3-none-any.whl

src/verily/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .models import Benchmark, BenchmarkCase, RunConfigOverride
+
+__all__ = ["Benchmark", "BenchmarkCase", "RunConfigOverride"]
src/verily/display.py ADDED
@@ -0,0 +1,112 @@
+from rich.spinner import Spinner
+from rich.table import Table
+from rich.text import Text
+
+from .models import BenchmarkRunResult
+
+
+def generate_benchmark_table(
+    results: list[BenchmarkRunResult], confidence_level: float
+) -> Table:
+    table = Table(show_header=True, header_style="cyan", expand=True)
+    table.add_column("Benchmark", style="dim", width=30)
+    table.add_column(f"Mean @ {confidence_level:.0%} CI", justify="right")
+    table.add_column("Std Dev", justify="right")
+    table.add_column("Stability", justify="right")
+    table.add_column("Runs", justify="right")
+    table.add_column("Avg Dur~ (s)", justify="right")
+
+    if not results:
+        table.add_row("Waiting for benchmarks to start...", "", "", "", "", "")
+        return table
+
+    for result in results:
+        benchmark_name = result.benchmark.name
+        all_runs = [run for case in result.case_results for run in case.runs]
+
+        pending_runs = [r for r in all_runs if r.result.type == "pending"]
+        successful_runs = [r for r in all_runs if r.result.type == "success"]
+        failed_runs = [r for r in all_runs if r.result.type == "failure"]
+
+        latest_stats = result.stats
+
+        all_cases_done = all(cr.done for cr in result.case_results)
+
+        def with_spinner(text: str | None) -> str | Spinner | Text:
+            if pending_runs and not all_cases_done:
+                if text:
+                    return Spinner("dots", text=Text(text, style="white"), style="cyan")
+                return Spinner("dots", style="cyan")
+            return text or "[yellow]N/A[/yellow]"
+
+        mean_display_text = None
+        if latest_stats:
+            mean_val_text = f"{latest_stats.mean * 100:.2f}%"
+            if latest_stats.precision:
+                margin_of_error = (
+                    latest_stats.precision.high - latest_stats.precision.low
+                ) / 2
+                margin_of_error_text = f"±{margin_of_error * 100:.2f}%"
+                mean_display_text = f"{mean_val_text} ({margin_of_error_text})"
+            else:
+                mean_display_text = mean_val_text
+
+        std_dev_text = f"{latest_stats.std_dev:.3f}" if latest_stats else None
+        stability_text = f"{latest_stats.stability * 100:.2f}%" if latest_stats else None
+
+        mean_score_display = with_spinner(mean_display_text)
+        std_dev_display = with_spinner(std_dev_text)
+        stability_display = with_spinner(stability_text)
+
+        total_runs_count = len(all_runs)
+        runs_display_text = f"{total_runs_count}"
+        if failed_runs:
+            runs_display_text += f" [red]({len(failed_runs)} failed)[/red]"
+
+        avg_run_duration_display: str | Spinner | Text
+        avg_runtime_text = None
+        completed_runs = successful_runs + failed_runs
+        if completed_runs:
+            total_runtime = sum(r.result.runtime for r in completed_runs)
+            avg_runtime = total_runtime / len(completed_runs)
+            avg_runtime_text = f"{avg_runtime:.2f}"
+
+        avg_run_duration_display = with_spinner(avg_runtime_text)
+
+        table.add_row(
+            benchmark_name,
+            mean_score_display,
+            std_dev_display,
+            stability_display,
+            runs_display_text,
+            avg_run_duration_display,
+        )
+
+    return table
+
+
+def generate_failures_table(
+    results: list[BenchmarkRunResult]
+) -> Table:
+    table = Table(show_header=True, header_style="bold red")
+    table.title = "Failures"
+    table.add_column("Benchmark", style="dim", width=30)
+    table.add_column("Case", justify="right")
+    table.add_column("Error")
+    table.add_column("Stdout")
+    table.add_column("Stderr")
+
+    for result in results:
+        for case_result in result.case_results:
+            for run in case_result.runs:
+                if run.result.type == "failure":
+                    failure = run.result
+                    table.add_row(
+                        result.benchmark.name,
+                        str(case_result.case.inputs),
+                        failure.error_message,
+                        failure.stdout.decode("utf-8", errors="ignore"),
+                        failure.stderr.decode("utf-8", errors="ignore"),
+                    )
+
+    return table
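
For reference, the table builder can be exercised on its own with a rich Console; a minimal sketch (an empty results list renders the placeholder row):

from rich.console import Console

from verily.display import generate_benchmark_table

console = Console()
table = generate_benchmark_table(results=[], confidence_level=0.95)
console.print(table)
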
src/verily/io.py ADDED
@@ -0,0 +1,152 @@
+import json
+from collections.abc import Callable, Iterator
+from contextlib import contextmanager
+
+import fsspec
+
+from .models import (
+    BenchmarkCase,
+    BenchmarkRunResult,
+    BenchmarkStatistics,
+    CaseResult,
+    FailureResult,
+    NamedBenchmark,
+    Run,
+    RunConfig,
+    RunResult,
+    SuccessResult,
+)
+from .stats import aggregate_benchmark_statistics, calculate_run_stats
+from .utils import create_sequential_run_directory
+
+
+def _serialize_run(run: Run) -> dict:
+    result_dump = None
+    if hasattr(run.result, "model_dump"):
+        result_dump = run.result.model_dump()
+        if "stdout" in result_dump and isinstance(result_dump["stdout"], bytes):
+            result_dump["stdout"] = result_dump["stdout"].decode("utf-8", "ignore")
+        if "stderr" in result_dump and isinstance(result_dump["stderr"], bytes):
+            result_dump["stderr"] = result_dump["stderr"].decode("utf-8", "ignore")
+
+    return {
+        "result": result_dump,
+        "stats": run.stats.model_dump() if run.stats else None,
+    }
+
+
+@contextmanager
+def create_results_writer(
+    results_dir: str,
+    config: RunConfig,
+) -> Iterator[Callable[[BenchmarkRunResult], None]]:
+    fs, path = fsspec.url_to_fs(results_dir)
+    run_dir = create_sequential_run_directory(fs=fs, base_path=path)
+    written_runs = set()
+
+    with fs.open(f"{run_dir}/config.json", "w") as f:
+        json.dump(config.model_dump(), f, indent=4)
+
+    with fs.open(f"{run_dir}/results.jsonl", "w") as f:
+
+        def writer(result: BenchmarkRunResult) -> None:
+            benchmark_name = result.benchmark.name
+            for case_idx, case_result in enumerate(result.case_results):
+                for run_idx, run in enumerate(case_result.runs):
+                    run_id = (benchmark_name, case_idx, run_idx)
+                    if run.result.type != "pending" and run_id not in written_runs:
+                        run_data = _serialize_run(run)
+                        output_record = {
+                            "benchmark_name": benchmark_name,
+                            "case_index": case_idx,
+                            "case_inputs": case_result.case.inputs,
+                            "run_index": run_idx,
+                            **run_data,
+                        }
+                        f.write(json.dumps(output_record) + "\n")
+                        written_runs.add(run_id)
+
+        yield writer
+
+
+def load_results(run_path: str) -> tuple[list[BenchmarkRunResult], RunConfig]:
+    fs, path = fsspec.url_to_fs(run_path)
+
+    with fs.open(f"{path}/config.json", "r") as f:
+        config_dict = json.load(f)
+        config = RunConfig(**config_dict)
+
+    results_by_benchmark_case: dict[
+        tuple[str, int], list[Run]
+    ] = {}
+
+    with fs.open(f"{path}/results.jsonl", "r") as f:
+        for line in f:
+            record = json.loads(line)
+            benchmark_name = record["benchmark_name"]
+            case_index = record["case_index"]
+            case_inputs = record["case_inputs"]
+            run_index = record["run_index"]
+            run_data = record["result"]
+            stats_data = record["stats"]
+
+            run_result: RunResult
+            if run_data["type"] == "success":
+                run_result = SuccessResult(
+                    comparison=run_data["comparison"],
+                    stdout=run_data["stdout"].encode("utf-8"),
+                    stderr=run_data["stderr"].encode("utf-8"),
+                    runtime=run_data["runtime"],
+                )
+            elif run_data["type"] == "failure":
+                run_result = FailureResult(
+                    error_message=run_data["error_message"],
+                    stdout=run_data["stdout"].encode("utf-8"),
+                    stderr=run_data["stderr"].encode("utf-8"),
+                    runtime=run_data["runtime"],
+                )
+            else:
+                # This should not happen with current serialization logic
+                continue
+
+            run_stats = (
+                BenchmarkStatistics(**stats_data) if stats_data else None
+            )
+            run = Run(result=run_result, stats=run_stats)
+
+            if (benchmark_name, case_index) not in results_by_benchmark_case:
+                results_by_benchmark_case[(benchmark_name, case_index)] = []
+
+            # Ensure the list is long enough to insert at run_index
+            current_runs = results_by_benchmark_case[(benchmark_name, case_index)]
+            while len(current_runs) <= run_index:
+                current_runs.append(None)  # type: ignore
+            current_runs[run_index] = run
+
+
+    benchmark_results: dict[str, BenchmarkRunResult] = {}
+
+    for (benchmark_name, case_index), runs in results_by_benchmark_case.items():
+        # NOTE: We don't have the original Benchmark object, so we create a dummy one
+        # This is acceptable as it's only used for display purposes and not execution
+        dummy_benchmark_case = BenchmarkCase(inputs=[], expectation=None)
+
+        # Filter out None runs and calculate stats for the case
+        successful_runs_in_case = [r.result for r in runs if r and r.result.type == "success"]
+        case_stats = calculate_run_stats(successful_runs_in_case, config.confidence_level)
+
+        case_result = CaseResult(case=dummy_benchmark_case, runs=runs, stats=case_stats)
+
+        if benchmark_name not in benchmark_results:
+            dummy_named_benchmark = NamedBenchmark(name=benchmark_name, benchmark=None)  # type: ignore
+            benchmark_results[benchmark_name] = BenchmarkRunResult(
+                benchmark=dummy_named_benchmark, case_results=[], stats=None
+            )
+        benchmark_results[benchmark_name].case_results.append(case_result)
+
+    # Calculate overall benchmark statistics
+    for benchmark_run_result in benchmark_results.values():
+        all_case_stats = [cr.stats for cr in benchmark_run_result.case_results if cr.stats]
+        benchmark_run_result.stats = aggregate_benchmark_statistics(all_case_stats)
+
+    return list(benchmark_results.values()), config
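
A hedged sketch of reading a saved run back with load_results; the path "benchmark-results/run-0001" is hypothetical, standing in for a directory produced by create_sequential_run_directory:

from verily.io import load_results

results, config = load_results("benchmark-results/run-0001")
for result in results:
    if result.stats:
        print(result.benchmark.name, f"mean={result.stats.mean:.3f}")
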
src/verily/main.py ADDED
@@ -0,0 +1,198 @@
+import asyncio
+import os
+import pickle
+from collections.abc import AsyncIterator
+from typing import Annotated
+
+import typer
+from rich.console import Console, Group
+from rich.live import Live
+
+from .display import (
+    generate_benchmark_table,
+    generate_failures_table,
+)
+from .io import create_results_writer, load_results
+from .models import BenchmarkRunResult, BenchmarkSession, RunConfig
+from .runner import (
+    resolve_benchmark,
+    run_benchmark_case,
+    stream_benchmarks_runs,
+)
+
+app = typer.Typer()
+
+
+def _render_benchmark_output(
+    results: list[BenchmarkRunResult], config: RunConfig
+):
+    benchmark_table = generate_benchmark_table(
+        results, confidence_level=config.confidence_level
+    )
+
+    all_benchmarks_and_cases_done = all(
+        all(cr.done for cr in r.case_results) for r in results
+    )
+
+    if not all_benchmarks_and_cases_done or not any(
+        run.result.type == "failure"
+        for r in results
+        for case in r.case_results
+        for run in case.runs
+    ):
+        return benchmark_table
+
+    failures_table = generate_failures_table(results)
+    return Group(failures_table, benchmark_table)
+
+
+async def _run_benchmarks(
+    stream: AsyncIterator[BenchmarkSession],
+    results_dir: str,
+    live: Live,
+    config: RunConfig,
+):
+    session: BenchmarkSession | None = None
+
+    live.update(_render_benchmark_output(session.results if session else [], config))
+
+    with create_results_writer(results_dir, config) as write_result:
+        async for session in stream:
+            live.update(_render_benchmark_output(session.results, config))
+            for result in session.results:
+                write_result(result)
+            live.update(_render_benchmark_output(session.results, config))
+
+
+async def _run_worker_case_and_dump_result(
+    benchmark_path: str,
+    case_index: int,
+    result_fd: int,
+):
+    benchmark = resolve_benchmark(benchmark_path)
+    output_data = await run_benchmark_case(benchmark, case_index)
+    with os.fdopen(result_fd, "wb") as result_pipe_w:
+        pickle.dump(output_data, result_pipe_w)
+
+
+def _show_results(results: list[BenchmarkRunResult], config: RunConfig, console: Console):
+    output = _render_benchmark_output(results, config)
+    console.print(output)
+
+
+@app.command("run")
+def run_command(
+    benchmark_paths: Annotated[
+        list[str],
+        typer.Option(
+            "--benchmark-path",
+            "-b",
+            help="Path to the benchmark to run (module:instance or file path)",
+        ),
+    ],
+    min_runs: Annotated[
+        int,
+        typer.Option(
+            "--min-runs",
+            help="Minimum number of times to repeat each benchmark case.",
+        ),
+    ],
+    max_runs: Annotated[
+        int,
+        typer.Option(
+            "--max-runs",
+            "-r",
+            help="Maximum number of times to repeat each benchmark case.",
+        ),
+    ],
+    batch_size: Annotated[
+        int,
+        typer.Option(
+            "--batch-size",
+            help="The number of runs to execute in a batch.",
+        ),
+    ],
+    stability_goal: Annotated[
+        float,
+        typer.Option(
+            "--stability-goal",
+            help="The stability goal for the benchmark runs.",
+        ),
+    ],
+    confidence_level: Annotated[
+        float,
+        typer.Option(
+            "--confidence-level",
+            help="Confidence level for the precision estimate.",
+        ),
+    ],
+    results_dir: Annotated[
+        str,
+        typer.Option(
+            "--results-dir",
+            "-o",
+            help="Directory to save benchmark results",
+        ),
+    ],
+):
+    loop = asyncio.get_event_loop()
+    run_config = RunConfig(
+        min_runs=min_runs,
+        max_runs=max_runs,
+        batch_size=batch_size,
+        stability_goal=stability_goal,
+        confidence_level=confidence_level,
+    )
+    stream = stream_benchmarks_runs(
+        benchmark_paths,
+        run_config=run_config,
+    )
+    with Live(refresh_per_second=10) as live:
+        loop.run_until_complete(
+            _run_benchmarks(stream, results_dir, live, run_config)
+        )
+
+
+@app.command("worker")
+def worker_command(
+    benchmark_path: Annotated[
+        str,
+        typer.Option(
+            "--benchmark-path",
+            help="Path to the benchmark (module:instance).",
+        ),
+    ],
+    case_index: Annotated[
+        int,
+        typer.Option(
+            "--case-index",
+            help="Index of the case to run.",
+        ),
+    ],
+    result_fd: Annotated[
+        int,
+        typer.Option(
+            "--result-fd",
+            help="File descriptor to write pickled result to.",
+        ),
+    ],
+):
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(
+        _run_worker_case_and_dump_result(benchmark_path, case_index, result_fd)
+    )
+
+
+@app.command("show")
+def show_command(
+    run_path: Annotated[
+        str,
+        typer.Argument(
+            help="Path to the directory containing benchmark results (e.g., a 'run-XXXX' directory)."
+        ),
+    ],
+):
+    console = Console()
+    results, run_config = load_results(run_path)
+    _show_results(results, run_config, console)
+
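
A sketch of driving the "run" command in-process with typer's test runner; the benchmark path "examples.my_benchmarks:bench" and the option values are illustrative only:

from typer.testing import CliRunner

from verily.main import app

runner = CliRunner()
result = runner.invoke(
    app,
    [
        "run",
        "--benchmark-path", "examples.my_benchmarks:bench",
        "--min-runs", "3",
        "--max-runs", "10",
        "--batch-size", "2",
        "--stability-goal", "0.95",
        "--confidence-level", "0.95",
        "--results-dir", "benchmark-results",
    ],
)
print(result.exit_code)
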
src/verily/models.py ADDED
@@ -0,0 +1,141 @@
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass
+from typing import Any, Literal, Protocol
+
+from pydantic import BaseModel, ConfigDict
+
+
+class Comparisonlike(Protocol):
+    similarity: float
+    actual: Any
+    expected: Any
+
+
+class Comparer(Protocol):
+    async def compare(self, item1: Any, item2: Any) -> Comparisonlike: ...
+
+
+class RunConfigOverride(BaseModel):
+    min_runs: int | None = None
+    max_runs: int | None = None
+    batch_size: int | None = None
+    stability_goal: float | None = None
+
+
+class BenchmarkCase(BaseModel):
+    inputs: list[Any]
+    expectation: Any
+    config: RunConfigOverride | None = None
+
+    def __hash__(self):
+        return id(self)
+
+    def __eq__(self, other):
+        return self is other
+
+
+@dataclass
+class Benchmark:
+    runner: Callable[..., Awaitable[Any]]
+    comparer: Comparer
+    cases: list[BenchmarkCase]
+    config: RunConfigOverride | None = None
+
+
+class Comparison(BaseModel):
+    model_config = ConfigDict(from_attributes=True)
+    similarity: float
+    actual: Any
+    expected: Any
+
+
+class SuccessfulEvaluation(BaseModel):
+    type: Literal["success"] = "success"
+    comparison: Comparison
+
+
+class FailingEvaluation(BaseModel):
+    type: Literal["failure"] = "failure"
+    error_message: str
+
+
+EvaluationResult = SuccessfulEvaluation | FailingEvaluation
+
+
+class SuccessResult(BaseModel):
+    type: Literal["success"] = "success"
+    comparison: Comparison
+    stdout: bytes
+    stderr: bytes
+    runtime: float
+
+
+class FailureResult(BaseModel):
+    type: Literal["failure"] = "failure"
+    error_message: str
+    stdout: bytes
+    stderr: bytes
+    runtime: float
+
+
+class PendingResult(BaseModel):
+    type: Literal["pending"] = "pending"
+
+
+RunResult = SuccessResult | FailureResult
+RunState = RunResult | PendingResult
+
+
+@dataclass
+class Run:
+    result: RunState
+    stats: "BenchmarkStatistics | None"
+
+
+@dataclass
+class CaseResult:
+    case: BenchmarkCase
+    runs: list[Run | None]
+    stats: "BenchmarkStatistics | None" = None
+    done: bool = False
+
+
+@dataclass(eq=False)
+class NamedBenchmark:
+    name: str
+    benchmark: Benchmark
+
+
+@dataclass
+class BenchmarkRunResult:
+    benchmark: NamedBenchmark
+    case_results: list[CaseResult]
+    stats: "BenchmarkStatistics | None" = None
+
+
+@dataclass
+class BenchmarkSession:
+    results: list[BenchmarkRunResult]
+
+
+class Interval(BaseModel):
+    low: float
+    high: float
+
+
+class BenchmarkStatistics(BaseModel):
+    mean: float
+    std_dev: float
+    precision: Interval | None
+    stability: float
+
+
+RunCallable = Callable[[], Awaitable[RunResult]]
+
+
+class RunConfig(BaseModel):
+    min_runs: int
+    max_runs: int
+    batch_size: int
+    stability_goal: float
+    confidence_level: float
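
A hedged sketch of what a user-defined benchmark module might look like with these models; the names echo_runner, ExactComparer, and bench are illustrative, not part of the package:

from typing import Any

from verily import Benchmark, BenchmarkCase
from verily.models import Comparison


class ExactComparer:
    # Satisfies the Comparer protocol: compare(expectation, actual) -> Comparisonlike.
    async def compare(self, expected: Any, actual: Any) -> Comparison:
        return Comparison(
            similarity=1.0 if expected == actual else 0.0,
            actual=actual,
            expected=expected,
        )


async def echo_runner(text: str) -> str:
    # A deterministic stand-in for the stochastic system under test.
    return text


bench = Benchmark(
    runner=echo_runner,
    comparer=ExactComparer(),
    cases=[BenchmarkCase(inputs=["hello"], expectation="hello")],
)
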
src/verily/runner.py ADDED
@@ -0,0 +1,305 @@
+import inspect
+import os
+import pickle
+import time
+import traceback
+from asyncio import create_subprocess_exec, subprocess
+from collections.abc import AsyncIterator, Iterable
+from importlib import import_module
+
+from aiostream import stream
+
+from .models import (
+    Benchmark,
+    BenchmarkCase,
+    BenchmarkRunResult,
+    BenchmarkSession,
+    BenchmarkStatistics,
+    CaseResult,
+    Comparison,
+    EvaluationResult,
+    FailingEvaluation,
+    FailureResult,
+    NamedBenchmark,
+    PendingResult,
+    Run,
+    RunConfig,
+    RunConfigOverride,
+    RunResult,
+    SuccessfulEvaluation,
+    SuccessResult,
+)
+from .stats import aggregate_benchmark_statistics, calculate_run_stats
+from .utils import _pipe
+
+
+class WorkerRuntimeError(RuntimeError):
+    def __init__(self, message: str, returncode: int, stdout: bytes, stderr: bytes):
+        super().__init__(message)
+        self.returncode = returncode
+        self.stdout = stdout
+        self.stderr = stderr
+
+
+# TODO: move
+def resolve_benchmark(benchmark_path: str) -> Benchmark:
+    module_name, instance_name = benchmark_path.split(":")
+    module = import_module(module_name)
+    benchmark_instance = getattr(module, instance_name)
+    if not isinstance(benchmark_instance, Benchmark):
+        raise TypeError(
+            f"Expected '{instance_name}' in '{module_name}' to be an instance of Benchmark, "
+            f"got {type(benchmark_instance).__name__} instead."
+        )
+    return benchmark_instance
+
+
+def resolve_benchmarks(benchmark_paths: list[str]) -> Iterable[NamedBenchmark]:
+    for path in benchmark_paths:
+        if ":" in path:
+            yield NamedBenchmark(name=path, benchmark=resolve_benchmark(path))
+        else:
+            module_name = path.replace("/", ".").removesuffix(".py")
+            module = import_module(module_name)
+            for name, member in inspect.getmembers(module):
+                if isinstance(member, Benchmark):
+                    yield NamedBenchmark(
+                        name=f"{module_name}:{name}", benchmark=member
+                    )
+
+
+def _get_run_result(read_fd: int) -> EvaluationResult:
+    with os.fdopen(read_fd, "rb") as result_pipe_r:
+        return pickle.load(result_pipe_r)
+
+
+def _get_test_name_from_path(benchmark_path: str) -> str:
+    try:
+        test_name = benchmark_path.split(":")[1]
+        return test_name.replace("_", " ").title()
+    except IndexError:
+        return benchmark_path
+
+
+def resolve_run_config(
+    global_config: RunConfig,
+    benchmark_config: RunConfigOverride | None,
+    case_config: RunConfigOverride | None,
+) -> RunConfig:
+    resolved_config = global_config.model_copy()
+
+    if benchmark_config:
+        resolved_config.min_runs = benchmark_config.min_runs or resolved_config.min_runs
+        resolved_config.max_runs = benchmark_config.max_runs or resolved_config.max_runs
+        resolved_config.batch_size = benchmark_config.batch_size or resolved_config.batch_size
+        resolved_config.stability_goal = benchmark_config.stability_goal or resolved_config.stability_goal
+
+    if case_config:
+        resolved_config.min_runs = case_config.min_runs or resolved_config.min_runs
+        resolved_config.max_runs = case_config.max_runs or resolved_config.max_runs
+        resolved_config.batch_size = case_config.batch_size or resolved_config.batch_size
+        resolved_config.stability_goal = case_config.stability_goal or resolved_config.stability_goal
+
+    return resolved_config
+
+
+async def run_benchmark_case(
+    benchmark: Benchmark,
+    case_idx: int,
+) -> EvaluationResult:
+    case = benchmark.cases[case_idx]
+
+    try:
+        result = await benchmark.runner(*case.inputs)
+        comparison = Comparison.model_validate(
+            await benchmark.comparer.compare(case.expectation, result)
+        )
+        return SuccessfulEvaluation(comparison=comparison)
+    except Exception:
+        return FailingEvaluation(error_message=traceback.format_exc())
+
+
+async def run_single_case(
+    benchmark_path_str: str,
+    case_idx: int,
+) -> RunResult:
+    start_time = time.monotonic()
+
+    with _pipe() as (read_fd, write_fd):
+        args = [
+            "stabe",
+            "worker",
+            "--benchmark-path",
+            benchmark_path_str,
+            "--case-index",
+            str(case_idx),
+            "--result-fd",
+            str(write_fd),
+        ]
+
+        proc = await create_subprocess_exec(
+            *args,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            pass_fds=(write_fd,),
+        )
+
+        stdout_bytes, stderr_bytes = await proc.communicate()
+        runtime = time.monotonic() - start_time
+
+        if proc.returncode != 0:
+            raise WorkerRuntimeError(
+                message=f"Worker process exited with code {proc.returncode}.",
+                returncode=proc.returncode,
+                stdout=stdout_bytes,
+                stderr=stderr_bytes,
+            )
+
+        try:
+            worker_output = _get_run_result(read_fd)
+            if not worker_output:
+                raise WorkerRuntimeError(
+                    message="Worker exited successfully but provided no data via pipe.",
+                    returncode=0,
+                    stdout=stdout_bytes,
+                    stderr=stderr_bytes,
+                )
+            if worker_output.type == "success":
+                return SuccessResult(
+                    comparison=worker_output.comparison,
+                    stdout=stdout_bytes,
+                    stderr=stderr_bytes,
+                    runtime=runtime,
+                )
+            return FailureResult(
+                error_message=worker_output.error_message,
+                stdout=stdout_bytes,
+                stderr=stderr_bytes,
+                runtime=runtime,
+            )
+        except (EOFError, pickle.UnpicklingError) as e:
+            raise WorkerRuntimeError(
+                message=f"Worker exited successfully but failed to provide valid result via pipe: {type(e).__name__} - {e}",
+                returncode=0,
+                stdout=stdout_bytes,
+                stderr=stderr_bytes,
+            )
+
+
+async def stream_case_runs(
+    benchmark_path: str,
+    case: BenchmarkCase,
+    case_idx: int,
+    global_run_config: RunConfig,
+    benchmark_run_config_override: RunConfigOverride | None,
+) -> AsyncIterator[CaseResult]:
+
+    run_config = resolve_run_config(
+        global_config=global_run_config,
+        benchmark_config=benchmark_run_config_override,
+        case_config=case.config,
+    )
+
+    runs: list[Run] = []
+    latest_stats: BenchmarkStatistics | None = None
+    run_count = 0
+
+    while run_count < run_config.max_runs:
+        # if we reach min runs and our latest stats
+        # meet the stability goal then we can stop
+        if (
+            run_count >= run_config.min_runs and
+            latest_stats and
+            latest_stats.stability >= run_config.stability_goal
+        ):
+            break
+
+        # Determine batch size for this iteration
+        batch_size = min(run_config.batch_size, run_config.max_runs - run_count)
+        batch_indices = list(range(run_count, run_count + batch_size))
+
+        run_count += batch_size
+
+        # start with pending results for the new batch
+        runs.extend([Run(result=PendingResult(), stats=None) for _ in batch_indices])
+        yield CaseResult(case=case, runs=list(runs), stats=latest_stats)
+
+        # note that while this function ultimately only yields once
+        # and could just be normal async, we leverage this iterator
+        # aspect to ensure we can provide continuous updates via the
+        # below stream merge
+        async def run_and_update(idx: int) -> AsyncIterator[CaseResult]:
+            nonlocal latest_stats
+            run_result = await run_single_case(benchmark_path, case_idx)
+            runs[idx] = Run(result=run_result, stats=None)
+            successful_runs = [
+                r.result for r in runs if r and r.result.type == "success"
+            ]
+            latest_stats = calculate_run_stats(
+                successful_runs, run_config.confidence_level
+            )
+            yield CaseResult(case=case, runs=list(runs), stats=latest_stats)
+
+        batch_runners = [run_and_update(idx) for idx in batch_indices]
+        merged_stream = stream.merge(*batch_runners)
+        async with merged_stream.stream() as streamer:
+            async for case_result in streamer:
+                yield case_result
+
+    yield CaseResult(case=case, runs=list(runs), stats=latest_stats, done=True)
+
+
+async def stream_benchmark_runs(
+    benchmark: NamedBenchmark,
+    run_config: RunConfig,
+) -> AsyncIterator[BenchmarkRunResult]:
+    case_results: dict[BenchmarkCase, CaseResult] = {
+        case: CaseResult(case=case, runs=[]) for case in benchmark.benchmark.cases
+    }
+
+    def get_stats() -> BenchmarkStatistics | None:
+        all_stats = [cr.stats for cr in case_results.values() if cr.stats]
+        return aggregate_benchmark_statistics(all_stats)
+
+    all_case_runners = [
+        stream_case_runs(
+            benchmark_path=benchmark.name,
+            case=case,
+            case_idx=benchmark.benchmark.cases.index(case),
+            global_run_config=run_config,
+            benchmark_run_config_override=benchmark.benchmark.config,
+        )
+        for case in benchmark.benchmark.cases
+    ]
+
+    merged_stream = stream.merge(*all_case_runners)
+    async with merged_stream.stream() as streamer:
+        async for case_result in streamer:
+            case_results[case_result.case] = case_result
+            yield BenchmarkRunResult(
+                benchmark=benchmark,
+                case_results=list(case_results.values()),
+                stats=get_stats(),
+            )
+
+
+# TODO: move or receive the result of resolve_benchmarks instead
+async def stream_benchmarks_runs(
+    benchmark_paths: list[str],
+    run_config: RunConfig,
+) -> AsyncIterator[BenchmarkSession]:
+    benchmarks = list(resolve_benchmarks(benchmark_paths))
+    benchmark_streams = [
+        stream_benchmark_runs(
+            named_benchmark,
+            run_config=run_config,
+        )
+        for named_benchmark in benchmarks
+    ]
+
+    benchmark_states: dict[NamedBenchmark, BenchmarkRunResult] = {}
+    merged_stream = stream.merge(*benchmark_streams)
+    async with merged_stream.stream() as streamer:
+        async for item in streamer:
+            benchmark_states[item.benchmark] = item
+            yield BenchmarkSession(results=list(benchmark_states.values()))
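
A sketch of consuming the run stream without the CLI; the benchmark module path "examples.my_benchmarks" is illustrative, and this assumes the worker command that run_single_case shells out to is available on PATH:

import asyncio

from verily.models import RunConfig
from verily.runner import stream_benchmarks_runs


async def main() -> None:
    config = RunConfig(
        min_runs=3, max_runs=10, batch_size=2,
        stability_goal=0.95, confidence_level=0.95,
    )
    # Each yielded session is a cumulative snapshot of all benchmarks so far.
    async for session in stream_benchmarks_runs(["examples.my_benchmarks"], run_config=config):
        for result in session.results:
            done = all(cr.done for cr in result.case_results)
            print(result.benchmark.name, "done" if done else "running")


asyncio.run(main())
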
src/verily/stats.py ADDED
@@ -0,0 +1,112 @@
+import numpy as np
+from scipy.stats import t
+
+from .models import BenchmarkStatistics, Interval, SuccessResult
+
+
+def calculate_stability(scores: list[float]) -> float:
+    """
+    calculates a stability score between 0 and 1 for a sequence of scores.
+    a score of 1.0 indicates perfect stability. this is based on a normalized
+    split r-hat diagnostic.
+    """
+    if len(scores) < 4:
+        r_hat = 2.0  # NOTE: not enough data to compute, assign high r-hat
+    else:
+        n = len(scores)
+        half_len = n // 2
+
+        # NOTE: split the sequence into two halves
+        chain1 = scores[:half_len]
+        chain2 = scores[half_len : half_len * 2]  # NOTE: ensure equal length
+
+        # 1. calculate within-chain variance (w)
+        var1 = np.var(chain1, ddof=1)
+        var2 = np.var(chain2, ddof=1)
+        w = 0.5 * (var1 + var2)
+
+        if w == 0:  # NOTE: avoid division by zero if variance is null
+            r_hat = 1.0
+        else:
+            # 2. calculate between-chain variance (b)
+            mean1 = np.mean(chain1)
+            mean2 = np.mean(chain2)
+            mean_total = np.mean(scores[: half_len * 2])
+            b = half_len * ((mean1 - mean_total) ** 2 + (mean2 - mean_total) ** 2)
+
+            # 3. estimate the marginal posterior variance (var_hat)
+            var_hat = ((half_len - 1) / half_len) * w + (1 / half_len) * b
+            r_hat = np.sqrt(var_hat / w)
+
+    # NOTE: ensure r_hat is at least 1.0 to prevent stability > 100%
+    return 1.0 / max(1.0, r_hat)
+
+def aggregate_benchmark_statistics(
+    stats_list: list[BenchmarkStatistics],
+) -> BenchmarkStatistics | None:
+    """
+    aggregates a list of benchmark statistics into a single representative
+    statistic.
+    """
+    if not stats_list:
+        return None
+
+    mean_of_means = float(np.mean([s.mean for s in stats_list]))
+    mean_of_std_devs = float(np.mean([s.std_dev for s in stats_list]))
+    mean_of_stabilities = float(np.mean([s.stability for s in stats_list]))
+
+    # NOTE: aggregate precision by taking the pessimistic (widest) interval
+    precision_intervals = [s.precision for s in stats_list if s.precision]
+    if precision_intervals:
+        min_low = float(np.min([p.low for p in precision_intervals]))
+        max_high = float(np.max([p.high for p in precision_intervals]))
+        aggregated_precision = Interval(low=min_low, high=max_high)
+    else:
+        aggregated_precision = None
+
+    return BenchmarkStatistics(
+        mean=mean_of_means,
+        std_dev=mean_of_std_devs,
+        precision=aggregated_precision,
+        stability=mean_of_stabilities,
+    )
+
+def calculate_run_stats(
+    runs: list[SuccessResult], confidence_level: float
+) -> BenchmarkStatistics | None:
+    """
+    calculates statistics for a series of successful runs.
+    """
+    if not runs:
+        return None
+
+    similarities = [run.comparison.similarity for run in runs]
+
+    mean_similarity = float(np.mean(similarities))
+    std_dev_similarity = (
+        float(np.std(similarities, ddof=1)) if len(similarities) > 1 else 0.0
+    )
+
+    precision = None
+    if len(similarities) > 1:
+        # NOTE: calculate confidence interval for the mean
+        with np.errstate(invalid="ignore"):  # NOTE: avoid warning when sem is 0
+            sem = std_dev_similarity / np.sqrt(len(similarities))
+            if sem > 0 and np.isfinite(sem):
+                t_crit = t.ppf((1 + confidence_level) / 2, len(similarities) - 1)
+                margin_of_error = t_crit * sem
+                precision = Interval(
+                    low=mean_similarity - margin_of_error,
+                    high=mean_similarity + margin_of_error,
+                )
+            elif sem == 0:
+                precision = Interval(low=mean_similarity, high=mean_similarity)
+
+    stability = calculate_stability(similarities)
+
+    return BenchmarkStatistics(
+        mean=mean_similarity,
+        std_dev=std_dev_similarity,
+        precision=precision,
+        stability=stability,
+    )
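
A quick illustration of the split R-hat based stability score on synthetic similarity scores; the values below are made up for the example:

from verily.stats import calculate_stability

steady = [0.90, 0.91, 0.90, 0.92, 0.91, 0.90]
drifting = [0.50, 0.55, 0.70, 0.85, 0.90, 0.95]

print(calculate_stability(steady))    # close to 1.0
print(calculate_stability(drifting))  # noticeably below 1.0
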
src/verily/utils.py ADDED
@@ -0,0 +1,71 @@
+import contextlib
+import errno
+
+from fsspec import AbstractFileSystem
+
+
+def create_sequential_run_directory(
+    fs: AbstractFileSystem, base_path: str, prefix: str = "run-"
+) -> str:
+    """
+    Creates a sequentially numbered, race-condition-safe run directory.
+    e.g., ./benchmarks/run-0001, ./benchmarks/run-0002, etc.
+
+    Args:
+        fs: The fsspec filesystem instance to use.
+        base_path: The base directory where run directories will be created.
+        prefix: The prefix for the run directory names (e.g., "run-").
+
+    Returns:
+        The path to the created directory.
+
+    Raises:
+        OSError: If directory creation fails after multiple retries, or for
+            any other unexpected OS error.
+    """
+    # find the last successful index to start the search from.
+    pattern = f"{base_path}{fs.sep}{prefix}*"
+    existing_dirs = fs.glob(pattern)
+    max_index = 0
+    for d in existing_dirs:
+        try:
+            index_str = d.split(fs.sep)[-1].replace(prefix, "")
+            max_index = max(max_index, int(index_str))
+        except (ValueError, IndexError):
+            continue
+
+    next_index = max_index + 1
+
+    # zero-pad the index to 4 digits (note: the 4-digit width is arbitrary)
+    dir_name = f"{prefix}{next_index:04d}"
+    full_path = f"{base_path}{fs.sep}{dir_name}"
+
+    fs.makedirs(full_path, exist_ok=False)
+    return full_path
+
+
+def _try_close_fd(fd: int):
+    try:
+        # NOTE: os.close is still used here as fsspec does not provide a direct
+        # equivalent for low-level file descriptor operations.
+        # This function is related to _pipe, which is a low-level OS pipe.
+        import os
+
+        os.close(fd)
+    except OSError as e:
+        if e.errno != errno.EBADF:
+            raise
+
+
+@contextlib.contextmanager
+def _pipe():
+    # NOTE: os.pipe is still used here as fsspec does not provide a direct
+    # equivalent for low-level file descriptor operations.
+    import os
+
+    read_fd, write_fd = os.pipe()
+    try:
+        yield read_fd, write_fd
+    finally:
+        _try_close_fd(read_fd)
+        _try_close_fd(write_fd)
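
A small sketch of the directory helper against the local fsspec filesystem; "benchmark-results" is an arbitrary base directory chosen for the example:

import fsspec

from verily.utils import create_sequential_run_directory

fs = fsspec.filesystem("file")
fs.makedirs("benchmark-results", exist_ok=True)
run_dir = create_sequential_run_directory(fs=fs, base_path="benchmark-results")
print(run_dir)  # e.g. "benchmark-results/run-0001"
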
verily-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,17 @@
+Metadata-Version: 2.4
+Name: verily
+Version: 0.1.0
+Summary: Simple stability testing test for stochastic systems
+License-File: LICENSE
+Requires-Python: >=3.11
+Requires-Dist: aiostream>=0.6.4
+Requires-Dist: fsspec>=2025.5.1
+Requires-Dist: numpy>=2.3.1
+Requires-Dist: pydantic>=2.11.7
+Requires-Dist: rich>=14.0.0
+Requires-Dist: scipy>=1.16.0
+Requires-Dist: typer>=0.16.0
+Description-Content-Type: text/markdown
+
+# Verily
+Simple stability testing test for stochastic systems
verily-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+src/verily/__init__.py,sha256=ghf3ojwgzu-J1SpUvj7JF_3Dsg2VVNnhiS4MNBOVPnQ,127
+src/verily/display.py,sha256=DlgY3skbZ8DS27w1atj-FQlpm7-xC0UA6nbkyoF-2EQ,4243
+src/verily/io.py,sha256=TzIUlsZBFZrmIREdPH_oa-S3ww3eDB9PfRH5D4xbC3Y,6049
+src/verily/main.py,sha256=JKUHYGOzzJHUT2V_RsE3M1CWzsi_VSzC_s-7Kmwff5g,5140
+src/verily/models.py,sha256=YSvbsBOOQz20vRkrUCDGX8ls0wmlVdTD6viTWACsS8k,2739
+src/verily/runner.py,sha256=Me7HGBOVpqi2NGG_CP9h6-v_1aencMpJOWu_FRZXDTQ,10514
+src/verily/stats.py,sha256=raHuMaGNcZQk5Hq4UrnCtaZEDpokp6wTxGhEmdx3TWU,3983
+src/verily/utils.py,sha256=ZwtfGn1WkXilF_HT7YUOM2AAOyVQRAn0nBgRRbFDwYs,2102
+verily-0.1.0.dist-info/METADATA,sha256=UaiHKaZElJVNdaHx4eNI-j9MGXKVuALXIfkeXaaTv6s,471
+verily-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+verily-0.1.0.dist-info/entry_points.txt,sha256=oOhhFgdZ0iaagLEahPtUvsFoAGaidzlCWeDGP0uN5Is,40
+verily-0.1.0.dist-info/licenses/LICENSE,sha256=l6AVXL-la_CflPm9SetkDxEiHBNfIetKdFGiWyCq8-k,1063
+verily-0.1.0.dist-info/RECORD,,
verily-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.27.0
+Root-Is-Purelib: true
+Tag: py3-none-any
verily-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+[console_scripts]
+zen = verily.main:app
verily-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 wowthx
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.