PyPI - skilltest-sdk - Versions diffs - 0.1.0__tar.gz - Mend

skilltest-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

skilltest_sdk-0.1.0/.gitignore +31 -0
skilltest_sdk-0.1.0/PKG-INFO +40 -0
skilltest_sdk-0.1.0/README.md +30 -0
skilltest_sdk-0.1.0/project.json +43 -0
skilltest_sdk-0.1.0/pyproject.toml +40 -0
skilltest_sdk-0.1.0/skilltest_sdk/__init__.py +61 -0
skilltest_sdk-0.1.0/skilltest_sdk/_report.py +125 -0
skilltest_sdk-0.1.0/skilltest_sdk/_validation.py +24 -0
skilltest_sdk-0.1.0/skilltest_sdk/errors.py +21 -0
skilltest_sdk-0.1.0/skilltest_sdk/models.py +69 -0
skilltest_sdk-0.1.0/skilltest_sdk/runner.py +129 -0
skilltest_sdk-0.1.0/tests/conftest.py +50 -0
skilltest_sdk-0.1.0/tests/test_api.py +76 -0
skilltest_sdk-0.1.0/uv.lock +566 -0

skilltest_sdk-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,31 @@
+# Rust
+/target
+**/*.rs.bk
+# Python
+.venv/
+__pycache__/
+*.pyc
+.pytest_cache/
+.ruff_cache/
+dist/
+*.egg-info/
+# Node / TypeScript
+node_modules/
+*.tsbuildinfo
+# Nx
+.nx/cache
+.nx/workspace-data
+# Editors / OS
+.DS_Store
+*.swp
+# Secrets: never commit these. `gh-secrets manifest sync` writes the real harness
+# auth tokens into .env (and tracks push state in .gh-secrets-state.json). The
+# manifest (gh-secrets.json) is committed; the resolved values are not.
+.env
+.env.local
+.gh-secrets-state.json

skilltest_sdk-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,40 @@
+Metadata-Version: 2.4
+Name: skilltest-sdk
+Version: 0.1.0
+Summary: Python SDK for the skilltest CLI: run AI-skill tests and natural-language evals from Python, with a typed report contract.
+Author: Nick DeRobertis
+License-Expression: MIT
+Requires-Python: >=3.12
+Requires-Dist: pydantic>=2.7
+Description-Content-Type: text/markdown
+# skilltest-sdk
+The Python SDK for the [`skilltest`](https://github.com/nickderobertis/skilltest)
+CLI. A thin, typed wrapper and nothing else: it runs the CLI as a subprocess and
+parses the stable `--format json` contract into Pydantic models. Test-framework
+integrations build on it — use [`skilltest-pytest`](../../plugins/pytest) if you
+want pytest collection; use this package directly from any other Python code.
+```python
+from skilltest_sdk import assistant_text, describe_failures, run_skill, validate_skill
+report = run_skill("cases/greet.yaml")
+assert report.passed, describe_failures(report)
+# Mix in deterministic checks on the transcript:
+assert "Dr. Smith" in assistant_text(report.runs[0].transcript)
+result = validate_skill("skills/greeter")
+assert result.valid
+```
+The `skilltest` binary is resolved from the `bin=` argument, the
+`SKILLTEST_BIN` env var, or `PATH`; a provider override comes from `provider=`
+or `SKILLTEST_PROVIDER`. A failing eval is *reported* (`report.passed` is
+false), not raised; bad input raises `SkilltestUsageError` (CLI exit 2) and
+provider problems raise `SkilltestProviderError` (exit 3).
+The Pydantic models are **generated** from `schemas/report.schema.json` /
+`schemas/validation.schema.json` — themselves generated from the CLI's own
+types — via `just gen-contract`, and a drift gate in CI fails if anything is
+stale, so the models cannot diverge from the binary.

skilltest_sdk-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,30 @@
+# skilltest-sdk
+The Python SDK for the [`skilltest`](https://github.com/nickderobertis/skilltest)
+CLI. A thin, typed wrapper and nothing else: it runs the CLI as a subprocess and
+parses the stable `--format json` contract into Pydantic models. Test-framework
+integrations build on it — use [`skilltest-pytest`](../../plugins/pytest) if you
+want pytest collection; use this package directly from any other Python code.
+```python
+from skilltest_sdk import assistant_text, describe_failures, run_skill, validate_skill
+report = run_skill("cases/greet.yaml")
+assert report.passed, describe_failures(report)
+# Mix in deterministic checks on the transcript:
+assert "Dr. Smith" in assistant_text(report.runs[0].transcript)
+result = validate_skill("skills/greeter")
+assert result.valid
+```
+The `skilltest` binary is resolved from the `bin=` argument, the
+`SKILLTEST_BIN` env var, or `PATH`; a provider override comes from `provider=`
+or `SKILLTEST_PROVIDER`. A failing eval is *reported* (`report.passed` is
+false), not raised; bad input raises `SkilltestUsageError` (CLI exit 2) and
+provider problems raise `SkilltestProviderError` (exit 3).
+The Pydantic models are **generated** from `schemas/report.schema.json` /
+`schemas/validation.schema.json` — themselves generated from the CLI's own
+types — via `just gen-contract`, and a drift gate in CI fails if anything is
+stale, so the models cannot diverge from the binary.

skilltest_sdk-0.1.0/project.json ADDED Viewed

@@ -0,0 +1,43 @@
+{
+  "$schema": "../../node_modules/nx/schemas/project-schema.json",
+  "name": "skilltest-sdk",
+  "projectType": "library",
+  "implicitDependencies": ["skilltest-cli"],
+  "targets": {
+    "lint": {
+      "executor": "nx:run-commands",
+      "options": {
+        "command": "uv run ruff check .",
+        "cwd": "{projectRoot}"
+      }
+    },
+    "format-check": {
+      "executor": "nx:run-commands",
+      "options": {
+        "command": "uv run ruff format --check .",
+        "cwd": "{projectRoot}"
+      }
+    },
+    "format": {
+      "executor": "nx:run-commands",
+      "options": {
+        "command": "uv run ruff format .",
+        "cwd": "{projectRoot}"
+      }
+    },
+    "typecheck": {
+      "executor": "nx:run-commands",
+      "options": {
+        "command": "uv run ty check",
+        "cwd": "{projectRoot}"
+      }
+    },
+    "test-e2e": {
+      "executor": "nx:run-commands",
+      "options": {
+        "command": "SKILLTEST_BIN=\"$PWD/../../target/debug/skilltest\" SKILLTEST_PROVIDER=\"$PWD/../../target/debug/skilltest-fake-provider\" uv run pytest",
+        "cwd": "{projectRoot}"
+      }
+    }
+  }
+}

skilltest_sdk-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,40 @@
+[project]
+name = "skilltest-sdk"
+version = "0.1.0"
+description = "Python SDK for the skilltest CLI: run AI-skill tests and natural-language evals from Python, with a typed report contract."
+readme = "README.md"
+requires-python = ">=3.12"
+license = "MIT"
+authors = [{ name = "Nick DeRobertis" }]
+dependencies = [
+    "pydantic>=2.7",
+]
+[dependency-groups]
+dev = [
+    "datamodel-code-generator>=0.62.0",
+    "pytest>=8",
+    "ruff>=0.6",
+    "ty>=0.0.1a1",
+]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["skilltest_sdk"]
+[tool.ruff]
+line-length = 100
+target-version = "py312"
+# Generated by `just gen-contract` from schemas/ — not hand-maintained, so not
+# held to hand-written style; the drift gate and `ty` still cover them.
+extend-exclude = ["skilltest_sdk/_report.py", "skilltest_sdk/_validation.py"]
+[tool.ruff.lint]
+select = ["E", "F", "I", "UP", "B", "SIM", "RUF"]
+[tool.pytest.ini_options]
+addopts = "-q"
+testpaths = ["tests"]

skilltest_sdk-0.1.0/skilltest_sdk/__init__.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""skilltest-sdk: the Python SDK for the ``skilltest`` CLI.
+A thin, typed wrapper around the CLI and nothing else: run test cases, validate
+skills, and get back models mirroring the ``--format json`` contract. The
+models are generated from the CLI's own JSON Schemas (``just gen-contract``),
+so they cannot drift from the binary. Test frameworks build on this —
+``skilltest-pytest`` adds pytest collection on top.
+    from skilltest_sdk import run_skill, describe_failures, assistant_text
+    report = run_skill("cases/greet.yaml")
+    assert report.passed, describe_failures(report)
+    assert "Dr. Smith" in assistant_text(report.runs[0].transcript)
+"""
+from __future__ import annotations
+from .errors import SkilltestError, SkilltestProviderError, SkilltestUsageError
+from .models import (
+    BooleanDetail,
+    CaseRun,
+    EvalOutcome,
+    Message,
+    NumericDetail,
+    Report,
+    Summary,
+    Transcript,
+    Usage,
+    ValidationFinding,
+    ValidationReport,
+    assistant_text,
+    describe_failures,
+    failed_evals,
+    failed_runs,
+)
+from .runner import ENV_BIN, ENV_PROVIDER, run_skill, validate_skill
+__all__ = [
+    "ENV_BIN",
+    "ENV_PROVIDER",
+    "BooleanDetail",
+    "CaseRun",
+    "EvalOutcome",
+    "Message",
+    "NumericDetail",
+    "Report",
+    "SkilltestError",
+    "SkilltestProviderError",
+    "SkilltestUsageError",
+    "Summary",
+    "Transcript",
+    "Usage",
+    "ValidationFinding",
+    "ValidationReport",
+    "assistant_text",
+    "describe_failures",
+    "failed_evals",
+    "failed_runs",
+    "run_skill",
+    "validate_skill",
+]

skilltest_sdk-0.1.0/skilltest_sdk/_report.py ADDED Viewed

@@ -0,0 +1,125 @@
+# generated by datamodel-codegen:
+#   filename:  report.schema.json
+from __future__ import annotations
+from typing import Literal
+from pydantic import BaseModel, Field
+class BooleanDetail(BaseModel):
+    """
+    The kind-specific detail of an eval outcome, for reporting.
+    The variant titles name the generated SDK model for each union arm, so keep
+    them stable: they are part of the SDK API surface.
+    """
+    expected: bool
+    kind: Literal["boolean"]
+    value: bool
+class NumericDetail(BaseModel):
+    """
+    The kind-specific detail of an eval outcome, for reporting.
+    The variant titles name the generated SDK model for each union arm, so keep
+    them stable: they are part of the SDK API surface.
+    """
+    comparator: Literal["gte", "gt", "lte", "lt"] = Field(
+        ..., description="How a numeric score is compared to its threshold."
+    )
+    kind: Literal["numeric"]
+    threshold: float
+    value: float
+class EvalOutcome(BaseModel):
+    """
+    The result of running one eval against a transcript.
+    """
+    detail: BooleanDetail | NumericDetail = Field(..., description="Kind-specific verdict detail.")
+    label: str = Field(..., description="The eval's label (name or criterion).")
+    passed: bool = Field(..., description="Whether the eval passed.")
+    reason: str = Field(..., description="The judge's stated reason.")
+class Usage(BaseModel):
+    """
+    Token / cost usage for one provider call.
+    Each field is independently optional because not every harness reports every
+    signal (cost is commonly absent on subscription auth; some harnesses report
+    no usage at all). The whole struct is `Option<Usage>` on a turn — `None`
+    means "no signal," not "zero."
+    """
+    cost_usd: float | None = None
+    input_tokens: int | None = Field(None, ge=0)
+    output_tokens: int | None = Field(None, ge=0)
+class Message(BaseModel):
+    """
+    A single turn in the conversation.
+    """
+    content: str
+    role: Literal["user", "assistant", "system"] = Field(..., description="Who produced a message.")
+class Summary(BaseModel):
+    """
+    Aggregate pass/fail counts for a report.
+    """
+    cases: int = Field(..., description="Distinct test cases represented.", ge=0)
+    failed: int = Field(..., description="Runs that failed.", ge=0)
+    passed: int = Field(..., description="Runs that passed.", ge=0)
+    runs: int = Field(..., description="Total (case × platform × model) runs.", ge=0)
+    usage: Usage | None = Field(
+        None,
+        description="Aggregated token/cost usage across every run in the report. Omitted\nwhen no run reported usage.",
+    )
+class Transcript(BaseModel):
+    """
+    An ordered list of messages. Thin wrapper so the type reads clearly at call
+    sites and so we can grow conversation-level helpers without churn.
+    """
+    messages: list[Message]
+class CaseRun(BaseModel):
+    """
+    The result of running one test case on one (platform, model) pair.
+    """
+    case: str = Field(..., description="The test case name.")
+    evals: list[EvalOutcome] = Field(..., description="Per-eval outcomes, in declaration order.")
+    model: str = Field(..., description="The model this run used.")
+    passed: bool = Field(..., description="True iff every eval in this run passed.")
+    platform: str = Field(..., description="The harness platform this run used.")
+    skill: str = Field(..., description="Absolute-ish path to the skill that was exercised.")
+    transcript: Transcript = Field(
+        ..., description="The full conversation, for debugging and deterministic mix-in checks."
+    )
+    turns: int = Field(..., description="Number of assistant turns produced.", ge=0)
+    usage: Usage | None = Field(
+        None,
+        description="Aggregated token/cost usage across every provider call in this run\n(skill turns + simulated-user turns + judge calls). Omitted when no\nusage was reported (e.g. the fake provider or a harness that doesn't\nsurface usage).",
+    )
+class Report(BaseModel):
+    """
+    The top-level report for a `skilltest run` invocation.
+    """
+    passed: bool = Field(..., description="True iff every run passed.")
+    runs: list[CaseRun] = Field(..., description="Every individual run.")
+    summary: Summary = Field(..., description="Aggregate counts.")

skilltest_sdk-0.1.0/skilltest_sdk/_validation.py ADDED Viewed

@@ -0,0 +1,24 @@
+# generated by datamodel-codegen:
+#   filename:  validation.schema.json
+from __future__ import annotations
+from pydantic import BaseModel, Field
+class ValidationFinding(BaseModel):
+    """
+    One problem found while validating a skill, as serialized in the
+    `skilltest validate --format json` output.
+    """
+    message: str = Field(..., description="What is wrong and how to fix it.")
+    skill: str = Field(..., description="The skill directory the finding is about.")
+class ValidationReport(BaseModel):
+    """
+    The top-level report for a `skilltest validate` invocation.
+    """
+    findings: list[ValidationFinding] = Field(..., description="Every finding, in discovery order.")
+    valid: bool = Field(..., description="True iff no findings were produced.")

skilltest_sdk-0.1.0/skilltest_sdk/errors.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""Exceptions mirroring the CLI's exit-code contract.
+The CLI distinguishes a *test failure* (exit 1 — surfaced as a [`Report`] with
+``passed == False``, not an exception) from *bad input* (exit 2) and a *provider
+failure* (exit 3). The latter two are environmental/usage problems the test
+author must fix, so they are raised.
+"""
+from __future__ import annotations
+class SkilltestError(Exception):
+    """Base class for skilltest SDK errors.
+    Catching this covers every exception the SDK raises itself. It is also raised
+    directly by the runner for an unexpected CLI exit code or for output that does
+    not match the expected schema.
+    """
+class SkilltestUsageError(SkilltestError):
+    """The CLI rejected the input (exit 2): bad config, malformed YAML, etc.
+    The message is the CLI's stderr, or its stdout when stderr is empty.
+    """
+class SkilltestProviderError(SkilltestError):
+    """The provider command failed (exit 3): not found, crashed, bad output.
+    The runner also raises this when the ``skilltest`` binary itself is not found.
+    """

skilltest_sdk-0.1.0/skilltest_sdk/models.py ADDED Viewed

@@ -0,0 +1,69 @@
+"""Typed views of the ``skilltest --format json`` contract, plus helpers.
+The model classes live in ``_report.py`` / ``_validation.py``, which are
+**generated** from the golden JSON Schemas in ``schemas/`` (themselves
+generated from the CLI's Rust types). Never edit the generated modules by
+hand — change the Rust types and run ``just gen-contract``; the gate fails
+while anything is stale. This facade re-exports the generated models and adds
+the hand-written conveniences, which the type checker keeps honest against the
+generated fields.
+"""
+from __future__ import annotations
+from ._report import (
+    BooleanDetail,
+    CaseRun,
+    EvalOutcome,
+    Message,
+    NumericDetail,
+    Report,
+    Summary,
+    Transcript,
+    Usage,
+)
+from ._validation import ValidationFinding, ValidationReport
+__all__ = [
+    "BooleanDetail",
+    "CaseRun",
+    "EvalOutcome",
+    "Message",
+    "NumericDetail",
+    "Report",
+    "Summary",
+    "Transcript",
+    "Usage",
+    "ValidationFinding",
+    "ValidationReport",
+    "assistant_text",
+    "describe_failures",
+    "failed_evals",
+    "failed_runs",
+]
+def assistant_text(transcript: Transcript) -> str:
+    """All assistant turns joined — handy for deterministic mix-in checks."""
+    return "\n".join(m.content for m in transcript.messages if m.role == "assistant")
+def failed_evals(run: CaseRun) -> list[EvalOutcome]:
+    """The evals of a run that did not pass."""
+    return [e for e in run.evals if not e.passed]
+def failed_runs(report: Report) -> list[CaseRun]:
+    """The runs of a report that did not pass."""
+    return [r for r in report.runs if not r.passed]
+def describe_failures(report: Report) -> str:
+    """A one-line-per-failed-eval summary, for assertion messages."""
+    lines: list[str] = []
+    for run in failed_runs(report):
+        for outcome in failed_evals(run):
+            lines.append(
+                f"{run.case} [{run.platform}/{run.model}] {outcome.label}: {outcome.reason}"
+            )
+    return "\n".join(lines)

skilltest_sdk-0.1.0/skilltest_sdk/runner.py ADDED Viewed

@@ -0,0 +1,129 @@
+"""Run the ``skilltest`` CLI as a subprocess and parse its JSON contract.
+This is the code-level API: call [`run_skill`][skilltest_sdk.runner.run_skill],
+get a typed [`Report`], assert on ``report.passed``, and mix in any
+deterministic checks against the transcript.
+"""
+from __future__ import annotations
+import os
+import subprocess
+from collections.abc import Sequence
+from pathlib import Path
+from pydantic import BaseModel, ValidationError
+from .errors import SkilltestError, SkilltestProviderError, SkilltestUsageError
+from .models import Report, ValidationReport
+#: Environment variables that supply defaults so callers (test-framework
+#: packages, CI) can locate the binary and provider without per-call arguments.
+ENV_BIN = "SKILLTEST_BIN"
+ENV_PROVIDER = "SKILLTEST_PROVIDER"
+# Exit codes that still produce a JSON report (0 = all passed, 1 = some failed).
+_REPORTING_CODES = frozenset({0, 1})
+def _resolve_bin(bin: str | Path | None) -> str:
+    if bin is not None:
+        return str(bin)
+    return os.environ.get(ENV_BIN, "skilltest")
+def _resolve_provider(provider: str | Sequence[str] | None) -> str | None:
+    if provider is None:
+        provider = os.environ.get(ENV_PROVIDER)
+    if provider is None:
+        return None
+    if isinstance(provider, str):
+        return provider
+    return " ".join(provider)
+def _run(argv: list[str], cwd: str | Path | None) -> subprocess.CompletedProcess[str]:
+    try:
+        return subprocess.run(
+            argv,
+            capture_output=True,
+            text=True,
+            cwd=cwd,
+            check=False,
+        )
+    except FileNotFoundError as exc:
+        raise SkilltestProviderError(
+            f"could not run skilltest binary `{argv[0]}`: {exc}. Set {ENV_BIN} or pass bin=..."
+        ) from exc
+def run_skill(
+    case: str | Path,
+    *,
+    bin: str | Path | None = None,
+    provider: str | Sequence[str] | None = None,
+    platforms: Sequence[str] = (),
+    models: Sequence[str] = (),
+    judge_model: str | None = None,
+    max_turns: int | None = None,
+    config: str | Path | None = None,
+    cwd: str | Path | None = None,
+) -> Report:
+    """Run one or more test cases and return the parsed [`Report`].
+    ``case`` is a test-case YAML file or a directory of them. A failing eval is
+    *not* an exception — it is reported in ``report.passed``/``report.runs`` so
+    the caller can assert and inspect. Only bad input ([`SkilltestUsageError`])
+    and provider failures ([`SkilltestProviderError`]) raise.
+    """
+    argv = [_resolve_bin(bin)]
+    if config is not None:
+        argv += ["--config", str(config)]
+    argv += ["run", str(case), "--format", "json"]
+    resolved_provider = _resolve_provider(provider)
+    if resolved_provider is not None:
+        argv += ["--provider", resolved_provider]
+    for platform in platforms:
+        argv += ["--platform", platform]
+    for model in models:
+        argv += ["--model", model]
+    if judge_model is not None:
+        argv += ["--judge-model", judge_model]
+    if max_turns is not None:
+        argv += ["--max-turns", str(max_turns)]
+    proc = _run(argv, cwd)
+    _raise_for_status(proc)
+    return _parse(Report, proc.stdout)
+def validate_skill(
+    path: str | Path,
+    *,
+    bin: str | Path | None = None,
+    cwd: str | Path | None = None,
+) -> ValidationReport:
+    """Validate a skill directory (or a folder of them) and return findings."""
+    argv = [_resolve_bin(bin), "validate", str(path), "--format", "json"]
+    proc = _run(argv, cwd)
+    _raise_for_status(proc)
+    return _parse(ValidationReport, proc.stdout)
+def _raise_for_status(proc: subprocess.CompletedProcess[str]) -> None:
+    if proc.returncode in _REPORTING_CODES:
+        return
+    detail = proc.stderr.strip() or proc.stdout.strip()
+    if proc.returncode == 2:
+        raise SkilltestUsageError(detail)
+    if proc.returncode == 3:
+        raise SkilltestProviderError(detail)
+    raise SkilltestError(f"skilltest exited {proc.returncode}: {detail}")
+def _parse[T: BaseModel](model: type[T], stdout: str) -> T:
+    try:
+        return model.model_validate_json(stdout)
+    except ValidationError as exc:
+        raise SkilltestError(f"skilltest output did not match the expected schema: {exc}") from exc

skilltest_sdk-0.1.0/tests/conftest.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Shared test setup: point the SDK at the locally built Rust binaries.
+These tests exercise the *real* `skilltest` binary and the deterministic
+`skilltest-fake-provider`, both built by `cargo build` (run via `just bootstrap`
+/ `just check` before the Python suite). Only the model is faked.
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+import pytest
+REPO_ROOT = Path(__file__).resolve().parents[3]
+TARGET = REPO_ROOT / "target" / "debug"
+SKILLTEST_BIN = TARGET / "skilltest"
+FAKE_PROVIDER = TARGET / "skilltest-fake-provider"
+FIXTURES = REPO_ROOT / "tests" / "fixtures"
+SCHEMAS = REPO_ROOT / "schemas"
+# Defaults so the runner finds the binary and provider without per-call wiring.
+os.environ.setdefault("SKILLTEST_BIN", str(SKILLTEST_BIN))
+os.environ.setdefault("SKILLTEST_PROVIDER", str(FAKE_PROVIDER))
+@pytest.fixture(scope="session", autouse=True)
+def _require_binaries() -> None:
+    missing = [p for p in (SKILLTEST_BIN, FAKE_PROVIDER) if not p.exists()]
+    if missing:
+        names = ", ".join(str(p) for p in missing)
+        pytest.fail(
+            f"built binaries not found: {names}. Run `just bootstrap` (cargo build) first.",
+            pytrace=False,
+        )
+@pytest.fixture
+def fixtures() -> Path:
+    """Path to the shared fixtures directory (``FIXTURES``: ``tests/fixtures`` under the repo root)."""
+    return FIXTURES
+@pytest.fixture
+def cases(fixtures: Path) -> Path:
+    """Path to the ``cases`` subdirectory of the ``fixtures`` directory."""
+    return fixtures / "cases"
+@pytest.fixture
+def schemas() -> Path:
+    """Path to the ``schemas`` directory under the repo root (``SCHEMAS``)."""
+    return SCHEMAS