skilltest-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ # Rust
2
+ /target
3
+ **/*.rs.bk
4
+
5
+ # Python
6
+ .venv/
7
+ __pycache__/
8
+ *.pyc
9
+ .pytest_cache/
10
+ .ruff_cache/
11
+ dist/
12
+ *.egg-info/
13
+
14
+ # Node / TypeScript
15
+ node_modules/
16
+ *.tsbuildinfo
17
+
18
+ # Nx
19
+ .nx/cache
20
+ .nx/workspace-data
21
+
22
+ # Editors / OS
23
+ .DS_Store
24
+ *.swp
25
+
26
+ # Secrets: never commit these. `gh-secrets manifest sync` writes the real harness
27
+ # auth tokens into .env (and tracks push state in .gh-secrets-state.json). The
28
+ # manifest (gh-secrets.json) is committed; the resolved values are not.
29
+ .env
30
+ .env.local
31
+ .gh-secrets-state.json
@@ -0,0 +1,40 @@
1
+ Metadata-Version: 2.4
2
+ Name: skilltest-sdk
3
+ Version: 0.1.0
4
+ Summary: Python SDK for the skilltest CLI: run AI-skill tests and natural-language evals from Python, with a typed report contract.
5
+ Author: Nick DeRobertis
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: pydantic>=2.7
9
+ Description-Content-Type: text/markdown
10
+
11
+ # skilltest-sdk
12
+
13
+ The Python SDK for the [`skilltest`](https://github.com/nickderobertis/skilltest)
14
+ CLI. A thin, typed wrapper and nothing else: it runs the CLI as a subprocess and
15
+ parses the stable `--format json` contract into Pydantic models. Test-framework
16
+ integrations build on it — use [`skilltest-pytest`](../../plugins/pytest) if you
17
+ want pytest collection; use this package directly from any other Python code.
18
+
19
+ ```python
20
+ from skilltest_sdk import assistant_text, describe_failures, run_skill, validate_skill
21
+
22
+ report = run_skill("cases/greet.yaml")
23
+ assert report.passed, describe_failures(report)
24
+ # Mix in deterministic checks on the transcript:
25
+ assert "Dr. Smith" in assistant_text(report.runs[0].transcript)
26
+
27
+ result = validate_skill("skills/greeter")
28
+ assert result.valid
29
+ ```
30
+
31
+ The `skilltest` binary is resolved from the `bin=` argument, the
32
+ `SKILLTEST_BIN` env var, or `PATH`; a provider override comes from `provider=`
33
+ or `SKILLTEST_PROVIDER`. A failing eval is *reported* (`report.passed` is
34
+ false), not raised; bad input raises `SkilltestUsageError` (CLI exit 2) and
35
+ provider problems raise `SkilltestProviderError` (exit 3).
36
+
37
+ The Pydantic models are **generated** from `schemas/report.schema.json` /
38
+ `schemas/validation.schema.json` — themselves generated from the CLI's own
39
+ types — via `just gen-contract`, and a drift gate in CI fails if anything is
40
+ stale, so the models cannot diverge from the binary.
@@ -0,0 +1,30 @@
1
+ # skilltest-sdk
2
+
3
+ The Python SDK for the [`skilltest`](https://github.com/nickderobertis/skilltest)
4
+ CLI. A thin, typed wrapper and nothing else: it runs the CLI as a subprocess and
5
+ parses the stable `--format json` contract into Pydantic models. Test-framework
6
+ integrations build on it — use [`skilltest-pytest`](../../plugins/pytest) if you
7
+ want pytest collection; use this package directly from any other Python code.
8
+
9
+ ```python
10
+ from skilltest_sdk import assistant_text, describe_failures, run_skill, validate_skill
11
+
12
+ report = run_skill("cases/greet.yaml")
13
+ assert report.passed, describe_failures(report)
14
+ # Mix in deterministic checks on the transcript:
15
+ assert "Dr. Smith" in assistant_text(report.runs[0].transcript)
16
+
17
+ result = validate_skill("skills/greeter")
18
+ assert result.valid
19
+ ```
20
+
21
+ The `skilltest` binary is resolved from the `bin=` argument, the
22
+ `SKILLTEST_BIN` env var, or `PATH`; a provider override comes from `provider=`
23
+ or `SKILLTEST_PROVIDER`. A failing eval is *reported* (`report.passed` is
24
+ false), not raised; bad input raises `SkilltestUsageError` (CLI exit 2) and
25
+ provider problems raise `SkilltestProviderError` (exit 3).
26
+
27
+ The Pydantic models are **generated** from `schemas/report.schema.json` /
28
+ `schemas/validation.schema.json` — themselves generated from the CLI's own
29
+ types — via `just gen-contract`, and a drift gate in CI fails if anything is
30
+ stale, so the models cannot diverge from the binary.
@@ -0,0 +1,43 @@
1
+ {
2
+ "$schema": "../../node_modules/nx/schemas/project-schema.json",
3
+ "name": "skilltest-sdk",
4
+ "projectType": "library",
5
+ "implicitDependencies": ["skilltest-cli"],
6
+ "targets": {
7
+ "lint": {
8
+ "executor": "nx:run-commands",
9
+ "options": {
10
+ "command": "uv run ruff check .",
11
+ "cwd": "{projectRoot}"
12
+ }
13
+ },
14
+ "format-check": {
15
+ "executor": "nx:run-commands",
16
+ "options": {
17
+ "command": "uv run ruff format --check .",
18
+ "cwd": "{projectRoot}"
19
+ }
20
+ },
21
+ "format": {
22
+ "executor": "nx:run-commands",
23
+ "options": {
24
+ "command": "uv run ruff format .",
25
+ "cwd": "{projectRoot}"
26
+ }
27
+ },
28
+ "typecheck": {
29
+ "executor": "nx:run-commands",
30
+ "options": {
31
+ "command": "uv run ty check",
32
+ "cwd": "{projectRoot}"
33
+ }
34
+ },
35
+ "test-e2e": {
36
+ "executor": "nx:run-commands",
37
+ "options": {
38
+ "command": "SKILLTEST_BIN=\"$PWD/../../target/debug/skilltest\" SKILLTEST_PROVIDER=\"$PWD/../../target/debug/skilltest-fake-provider\" uv run pytest",
39
+ "cwd": "{projectRoot}"
40
+ }
41
+ }
42
+ }
43
+ }
@@ -0,0 +1,40 @@
1
+ [project]
2
+ name = "skilltest-sdk"
3
+ version = "0.1.0"
4
+ description = "Python SDK for the skilltest CLI: run AI-skill tests and natural-language evals from Python, with a typed report contract."
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ license = "MIT"
8
+ authors = [{ name = "Nick DeRobertis" }]
9
+ dependencies = [
10
+ "pydantic>=2.7",
11
+ ]
12
+
13
+ [dependency-groups]
14
+ dev = [
15
+ "datamodel-code-generator>=0.62.0",
16
+ "pytest>=8",
17
+ "ruff>=0.6",
18
+ "ty>=0.0.1a1",
19
+ ]
20
+
21
+ [build-system]
22
+ requires = ["hatchling"]
23
+ build-backend = "hatchling.build"
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["skilltest_sdk"]
27
+
28
+ [tool.ruff]
29
+ line-length = 100
30
+ target-version = "py312"
31
+ # Generated by `just gen-contract` from schemas/ — not hand-maintained, so not
32
+ # held to hand-written style; the drift gate and `ty` still cover them.
33
+ extend-exclude = ["skilltest_sdk/_report.py", "skilltest_sdk/_validation.py"]
34
+
35
+ [tool.ruff.lint]
36
+ select = ["E", "F", "I", "UP", "B", "SIM", "RUF"]
37
+
38
+ [tool.pytest.ini_options]
39
+ addopts = "-q"
40
+ testpaths = ["tests"]
@@ -0,0 +1,61 @@
1
+ """skilltest-sdk: the Python SDK for the ``skilltest`` CLI.
2
+
3
+ A thin, typed wrapper around the CLI and nothing else: run test cases, validate
4
+ skills, and get back models mirroring the ``--format json`` contract. The
5
+ models are generated from the CLI's own JSON Schemas (``just gen-contract``),
6
+ so they cannot drift from the binary. Test frameworks build on this —
7
+ ``skilltest-pytest`` adds pytest collection on top.
8
+
9
+ from skilltest_sdk import run_skill, describe_failures, assistant_text
10
+
11
+ report = run_skill("cases/greet.yaml")
12
+ assert report.passed, describe_failures(report)
13
+ assert "Dr. Smith" in assistant_text(report.runs[0].transcript)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from .errors import SkilltestError, SkilltestProviderError, SkilltestUsageError
19
+ from .models import (
20
+ BooleanDetail,
21
+ CaseRun,
22
+ EvalOutcome,
23
+ Message,
24
+ NumericDetail,
25
+ Report,
26
+ Summary,
27
+ Transcript,
28
+ Usage,
29
+ ValidationFinding,
30
+ ValidationReport,
31
+ assistant_text,
32
+ describe_failures,
33
+ failed_evals,
34
+ failed_runs,
35
+ )
36
+ from .runner import ENV_BIN, ENV_PROVIDER, run_skill, validate_skill
37
+
38
+ __all__ = [
39
+ "ENV_BIN",
40
+ "ENV_PROVIDER",
41
+ "BooleanDetail",
42
+ "CaseRun",
43
+ "EvalOutcome",
44
+ "Message",
45
+ "NumericDetail",
46
+ "Report",
47
+ "SkilltestError",
48
+ "SkilltestProviderError",
49
+ "SkilltestUsageError",
50
+ "Summary",
51
+ "Transcript",
52
+ "Usage",
53
+ "ValidationFinding",
54
+ "ValidationReport",
55
+ "assistant_text",
56
+ "describe_failures",
57
+ "failed_evals",
58
+ "failed_runs",
59
+ "run_skill",
60
+ "validate_skill",
61
+ ]
@@ -0,0 +1,125 @@
1
+ # generated by datamodel-codegen:
2
+ # filename: report.schema.json
3
+
4
+ from __future__ import annotations
5
+ from typing import Literal
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
+ class BooleanDetail(BaseModel):
10
+ """
11
+ The kind-specific detail of an eval outcome, for reporting.
12
+
13
+ The variant titles name the generated SDK model for each union arm, so keep
14
+ them stable: they are part of the SDK API surface.
15
+ """
16
+
17
+ expected: bool
18
+ kind: Literal["boolean"]
19
+ value: bool
20
+
21
+
22
+ class NumericDetail(BaseModel):
23
+ """
24
+ The kind-specific detail of an eval outcome, for reporting.
25
+
26
+ The variant titles name the generated SDK model for each union arm, so keep
27
+ them stable: they are part of the SDK API surface.
28
+ """
29
+
30
+ comparator: Literal["gte", "gt", "lte", "lt"] = Field(
31
+ ..., description="How a numeric score is compared to its threshold."
32
+ )
33
+ kind: Literal["numeric"]
34
+ threshold: float
35
+ value: float
36
+
37
+
38
+ class EvalOutcome(BaseModel):
39
+ """
40
+ The result of running one eval against a transcript.
41
+ """
42
+
43
+ detail: BooleanDetail | NumericDetail = Field(..., description="Kind-specific verdict detail.")
44
+ label: str = Field(..., description="The eval's label (name or criterion).")
45
+ passed: bool = Field(..., description="Whether the eval passed.")
46
+ reason: str = Field(..., description="The judge's stated reason.")
47
+
48
+
49
+ class Usage(BaseModel):
50
+ """
51
+ Token / cost usage for one provider call.
52
+
53
+ Each field is independently optional because not every harness reports every
54
+ signal (cost is commonly absent on subscription auth; some harnesses report
55
+ no usage at all). The whole struct is `Option<Usage>` on a turn — `None`
56
+ means "no signal," not "zero."
57
+ """
58
+
59
+ cost_usd: float | None = None
60
+ input_tokens: int | None = Field(None, ge=0)
61
+ output_tokens: int | None = Field(None, ge=0)
62
+
63
+
64
+ class Message(BaseModel):
65
+ """
66
+ A single turn in the conversation.
67
+ """
68
+
69
+ content: str
70
+ role: Literal["user", "assistant", "system"] = Field(..., description="Who produced a message.")
71
+
72
+
73
+ class Summary(BaseModel):
74
+ """
75
+ Aggregate pass/fail counts for a report.
76
+ """
77
+
78
+ cases: int = Field(..., description="Distinct test cases represented.", ge=0)
79
+ failed: int = Field(..., description="Runs that failed.", ge=0)
80
+ passed: int = Field(..., description="Runs that passed.", ge=0)
81
+ runs: int = Field(..., description="Total (case × platform × model) runs.", ge=0)
82
+ usage: Usage | None = Field(
83
+ None,
84
+ description="Aggregated token/cost usage across every run in the report. Omitted\nwhen no run reported usage.",
85
+ )
86
+
87
+
88
+ class Transcript(BaseModel):
89
+ """
90
+ An ordered list of messages. Thin wrapper so the type reads clearly at call
91
+ sites and so we can grow conversation-level helpers without churn.
92
+ """
93
+
94
+ messages: list[Message]
95
+
96
+
97
+ class CaseRun(BaseModel):
98
+ """
99
+ The result of running one test case on one (platform, model) pair.
100
+ """
101
+
102
+ case: str = Field(..., description="The test case name.")
103
+ evals: list[EvalOutcome] = Field(..., description="Per-eval outcomes, in declaration order.")
104
+ model: str = Field(..., description="The model this run used.")
105
+ passed: bool = Field(..., description="True iff every eval in this run passed.")
106
+ platform: str = Field(..., description="The harness platform this run used.")
107
+ skill: str = Field(..., description="Absolute-ish path to the skill that was exercised.")
108
+ transcript: Transcript = Field(
109
+ ..., description="The full conversation, for debugging and deterministic mix-in checks."
110
+ )
111
+ turns: int = Field(..., description="Number of assistant turns produced.", ge=0)
112
+ usage: Usage | None = Field(
113
+ None,
114
+ description="Aggregated token/cost usage across every provider call in this run\n(skill turns + simulated-user turns + judge calls). Omitted when no\nusage was reported (e.g. the fake provider or a harness that doesn't\nsurface usage).",
115
+ )
116
+
117
+
118
+ class Report(BaseModel):
119
+ """
120
+ The top-level report for a `skilltest run` invocation.
121
+ """
122
+
123
+ passed: bool = Field(..., description="True iff every run passed.")
124
+ runs: list[CaseRun] = Field(..., description="Every individual run.")
125
+ summary: Summary = Field(..., description="Aggregate counts.")
@@ -0,0 +1,24 @@
1
+ # generated by datamodel-codegen:
2
+ # filename: validation.schema.json
3
+
4
+ from __future__ import annotations
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
+ class ValidationFinding(BaseModel):
9
+ """
10
+ One problem found while validating a skill, as serialized in the
11
+ `skilltest validate --format json` output.
12
+ """
13
+
14
+ message: str = Field(..., description="What is wrong and how to fix it.")
15
+ skill: str = Field(..., description="The skill directory the finding is about.")
16
+
17
+
18
+ class ValidationReport(BaseModel):
19
+ """
20
+ The top-level report for a `skilltest validate` invocation.
21
+ """
22
+
23
+ findings: list[ValidationFinding] = Field(..., description="Every finding, in discovery order.")
24
+ valid: bool = Field(..., description="True iff no findings were produced.")
@@ -0,0 +1,21 @@
1
+ """Exceptions mirroring the CLI's exit-code contract.
2
+
3
+ The CLI distinguishes a *test failure* (exit 1 — surfaced as a [`Report`] with
4
+ ``passed == False``, not an exception) from *bad input* (exit 2) and a *provider
5
+ failure* (exit 3). The latter two are environmental/usage problems the test
6
+ author must fix, so they are raised.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+
12
class SkilltestError(Exception):
    """Base class for skilltest SDK errors.

    Catching this type covers the more specific usage and provider errors as
    well as the runner's generic failures (an unexpected exit code, or CLI
    output that does not match the expected schema).
    """
14
+
15
+
16
class SkilltestUsageError(SkilltestError):
    """The CLI rejected the input (exit 2): bad config, malformed YAML, etc.

    The runner raises this when the ``skilltest`` process exits with code 2;
    the message is the CLI's stderr (or stdout when stderr is empty).
    """
18
+
19
+
20
class SkilltestProviderError(SkilltestError):
    """The provider command failed (exit 3): not found, crashed, bad output.

    The runner also raises this when the ``skilltest`` binary itself cannot be
    launched because it was not found.
    """
@@ -0,0 +1,69 @@
1
+ """Typed views of the ``skilltest --format json`` contract, plus helpers.
2
+
3
+ The model classes live in ``_report.py`` / ``_validation.py``, which are
4
+ **generated** from the golden JSON Schemas in ``schemas/`` (themselves
5
+ generated from the CLI's Rust types). Never edit the generated modules by
6
+ hand — change the Rust types and run ``just gen-contract``; the gate fails
7
+ while anything is stale. This facade re-exports the generated models and adds
8
+ the hand-written conveniences, which the type checker keeps honest against the
9
+ generated fields.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from ._report import (
15
+ BooleanDetail,
16
+ CaseRun,
17
+ EvalOutcome,
18
+ Message,
19
+ NumericDetail,
20
+ Report,
21
+ Summary,
22
+ Transcript,
23
+ Usage,
24
+ )
25
+ from ._validation import ValidationFinding, ValidationReport
26
+
27
+ __all__ = [
28
+ "BooleanDetail",
29
+ "CaseRun",
30
+ "EvalOutcome",
31
+ "Message",
32
+ "NumericDetail",
33
+ "Report",
34
+ "Summary",
35
+ "Transcript",
36
+ "Usage",
37
+ "ValidationFinding",
38
+ "ValidationReport",
39
+ "assistant_text",
40
+ "describe_failures",
41
+ "failed_evals",
42
+ "failed_runs",
43
+ ]
44
+
45
+
46
def assistant_text(transcript: Transcript) -> str:
    """Return the content of every assistant message, newline-separated.

    Messages from any other role are skipped and transcript order is kept, so
    the result is convenient for deterministic substring checks on the replies.
    """
    replies: list[str] = []
    for message in transcript.messages:
        if message.role == "assistant":
            replies.append(message.content)
    return "\n".join(replies)
49
+
50
+
51
def failed_evals(run: CaseRun) -> list[EvalOutcome]:
    """Return the run's eval outcomes that did not pass, in their original order."""
    failures: list[EvalOutcome] = []
    for outcome in run.evals:
        if outcome.passed:
            continue
        failures.append(outcome)
    return failures
54
+
55
+
56
def failed_runs(report: Report) -> list[CaseRun]:
    """Return the report's runs that did not pass, in their original order."""
    failures: list[CaseRun] = []
    for run in report.runs:
        if run.passed:
            continue
        failures.append(run)
    return failures
59
+
60
+
61
def describe_failures(report: Report) -> str:
    """Summarise every failed eval of every failed run, one line each.

    Each line reads ``<case> [<platform>/<model>] <label>: <reason>``. The result
    is an empty string when there is nothing to report, which makes it suitable
    as the message of an ``assert report.passed, ...`` statement.
    """
    return "\n".join(
        f"{run.case} [{run.platform}/{run.model}] {outcome.label}: {outcome.reason}"
        for run in failed_runs(report)
        for outcome in failed_evals(run)
    )
@@ -0,0 +1,129 @@
1
+ """Run the ``skilltest`` CLI as a subprocess and parse its JSON contract.
2
+
3
+ This is the code-level API: call [`run_skill`][skilltest_sdk.runner.run_skill],
4
+ get a typed [`Report`], assert on ``report.passed``, and mix in any
5
+ deterministic checks against the transcript.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ import subprocess
12
+ from collections.abc import Sequence
13
+ from pathlib import Path
14
+
15
+ from pydantic import BaseModel, ValidationError
16
+
17
+ from .errors import SkilltestError, SkilltestProviderError, SkilltestUsageError
18
+ from .models import Report, ValidationReport
19
+
20
#: Names of the environment variables that supply defaults, so callers
#: (test-framework packages, CI) can point at the binary and provider once
#: instead of passing arguments on every call.
ENV_BIN = "SKILLTEST_BIN"
ENV_PROVIDER = "SKILLTEST_PROVIDER"

# The CLI still emits a JSON report for these exit codes:
# 0 = all passed, 1 = some failed.
_REPORTING_CODES = frozenset((0, 1))
27
+
28
+
29
def _resolve_bin(bin: str | Path | None) -> str:
    """Pick the ``skilltest`` executable to launch.

    Precedence: the explicit ``bin`` argument, then the ``SKILLTEST_BIN``
    environment variable, then the bare name ``skilltest`` (left for the
    subprocess machinery to locate).
    """
    if bin is not None:
        return str(bin)
    # An exported-but-empty variable can never name an executable, so treat it
    # like an unset one instead of trying to launch "".
    return os.environ.get(ENV_BIN) or "skilltest"
33
+
34
+
35
def _resolve_provider(provider: str | Sequence[str] | None) -> str | None:
    """Return the value for the CLI's ``--provider`` flag, or ``None`` for no override.

    An explicit ``provider`` argument wins; otherwise the ``SKILLTEST_PROVIDER``
    environment variable is consulted. A string is returned unchanged and a
    sequence is flattened into a single space-separated string.
    """
    if provider is None:
        # An empty variable is treated as unset rather than forwarded as
        # `--provider ""`.
        provider = os.environ.get(ENV_PROVIDER) or None
    if provider is None:
        return None
    if isinstance(provider, str):
        return provider
    # NOTE(review): a plain space join loses argument boundaries when an element
    # contains whitespace. If the CLI splits --provider with shell rules,
    # `shlex.join` would be the safer encoding; confirm before changing.
    return " ".join(provider)
43
+
44
+
45
def _run(argv: list[str], cwd: str | Path | None) -> subprocess.CompletedProcess[str]:
    """Launch ``argv`` and capture stdout/stderr as text.

    A non-zero exit status is not an error here (``check=False``); the caller
    interprets the return code.

    Raises:
        SkilltestProviderError: if the process cannot be started because the
            binary, or the requested working directory, does not exist.
    """
    try:
        return subprocess.run(
            argv,
            capture_output=True,
            text=True,
            # JSON is exchanged as UTF-8 (RFC 8259). Without an explicit encoding
            # the locale's preferred encoding is used, which can garble or reject
            # non-ASCII transcript content on non-UTF-8 locales.
            encoding="utf-8",
            cwd=cwd,
            check=False,
        )
    except FileNotFoundError as exc:
        # A missing working directory raises the same exception type as a missing
        # binary; check it first so the message points at the right fix.
        if cwd is not None and not Path(cwd).is_dir():
            raise SkilltestProviderError(
                f"could not run skilltest: working directory `{cwd}` does not exist: {exc}"
            ) from exc
        raise SkilltestProviderError(
            f"could not run skilltest binary `{argv[0]}`: {exc}. Set {ENV_BIN} or pass bin=..."
        ) from exc
58
+
59
+
60
def run_skill(
    case: str | Path,
    *,
    bin: str | Path | None = None,
    provider: str | Sequence[str] | None = None,
    platforms: Sequence[str] = (),
    models: Sequence[str] = (),
    judge_model: str | None = None,
    max_turns: int | None = None,
    config: str | Path | None = None,
    cwd: str | Path | None = None,
) -> Report:
    """Run one or more test cases and return the parsed [`Report`].

    ``case`` is a test-case YAML file or a directory of them. A failing eval is
    *not* an exception — it is reported in ``report.passed``/``report.runs`` so
    the caller can assert and inspect. Only bad input ([`SkilltestUsageError`])
    and provider failures ([`SkilltestProviderError`]) raise.

    Args:
        case: Path handed to ``skilltest run``.
        bin: Executable to launch; defaults to ``SKILLTEST_BIN`` or ``skilltest``.
        provider: Value for ``--provider``; defaults to ``SKILLTEST_PROVIDER``.
        platforms: Each entry is forwarded as its own ``--platform`` flag. A
            single string is accepted and treated as one platform.
        models: Each entry is forwarded as its own ``--model`` flag. A single
            string is accepted and treated as one model.
        judge_model: Forwarded as ``--judge-model`` when given.
        max_turns: Forwarded as ``--max-turns`` when given.
        config: Forwarded as ``--config`` (placed before the subcommand) when given.
        cwd: Working directory for the subprocess.

    Raises:
        SkilltestUsageError: the CLI exited with code 2.
        SkilltestProviderError: the CLI exited with code 3, or could not be launched.
        SkilltestError: any other non-reporting exit code, or output that does
            not match the report schema.
    """
    # `str` satisfies `Sequence[str]`, so a bare "claude" would otherwise be
    # iterated character by character into one flag per letter.
    if isinstance(platforms, str):
        platforms = (platforms,)
    if isinstance(models, str):
        models = (models,)

    argv = [_resolve_bin(bin)]
    if config is not None:
        argv += ["--config", str(config)]
    argv += ["run", str(case), "--format", "json"]

    resolved_provider = _resolve_provider(provider)
    if resolved_provider is not None:
        argv += ["--provider", resolved_provider]
    for platform in platforms:
        argv += ["--platform", platform]
    for model in models:
        argv += ["--model", model]
    if judge_model is not None:
        argv += ["--judge-model", judge_model]
    if max_turns is not None:
        argv += ["--max-turns", str(max_turns)]

    proc = _run(argv, cwd)
    _raise_for_status(proc)
    return _parse(Report, proc.stdout)
99
+
100
+
101
def validate_skill(
    path: str | Path,
    *,
    bin: str | Path | None = None,
    cwd: str | Path | None = None,
) -> ValidationReport:
    """Run ``skilltest validate`` on a skill directory (or a folder of them).

    Returns the parsed ``ValidationReport``. Exit codes other than 0 and 1 are
    raised as SDK errors rather than returned.
    """
    command = [_resolve_bin(bin)]
    command.extend(("validate", str(path), "--format", "json"))
    completed = _run(command, cwd)
    _raise_for_status(completed)
    return _parse(ValidationReport, completed.stdout)
112
+
113
+
114
def _raise_for_status(proc: subprocess.CompletedProcess[str]) -> None:
    """Raise the SDK exception matching a non-reporting exit code.

    Exit codes in ``_REPORTING_CODES`` return silently because a JSON report is
    still expected on stdout. Code 2 maps to ``SkilltestUsageError``, code 3 to
    ``SkilltestProviderError``, and anything else to a generic ``SkilltestError``
    that names the exit code.
    """
    code = proc.returncode
    if code in _REPORTING_CODES:
        return

    # Prefer stderr for the message; fall back to stdout when stderr is blank.
    message = proc.stderr.strip()
    if not message:
        message = proc.stdout.strip()

    specific = {2: SkilltestUsageError, 3: SkilltestProviderError}.get(code)
    if specific is None:
        raise SkilltestError(f"skilltest exited {code}: {message}")
    raise specific(message)
123
+
124
+
125
+ def _parse[T: BaseModel](model: type[T], stdout: str) -> T:
126
+ try:
127
+ return model.model_validate_json(stdout)
128
+ except ValidationError as exc:
129
+ raise SkilltestError(f"skilltest output did not match the expected schema: {exc}") from exc
@@ -0,0 +1,50 @@
1
+ """Shared test setup: point the SDK at the locally built Rust binaries.
2
+
3
+ These tests exercise the *real* `skilltest` binary and the deterministic
4
+ `skilltest-fake-provider`, both built by `cargo build` (run via `just bootstrap`
5
+ / `just check` before the Python suite). Only the model is faked.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from pathlib import Path
12
+
13
+ import pytest
14
+
15
# Three levels above the directory that holds this file.
REPO_ROOT = Path(__file__).resolve().parents[3]
TARGET = REPO_ROOT.joinpath("target", "debug")
SKILLTEST_BIN = TARGET.joinpath("skilltest")
FAKE_PROVIDER = TARGET.joinpath("skilltest-fake-provider")
FIXTURES = REPO_ROOT.joinpath("tests", "fixtures")
SCHEMAS = REPO_ROOT.joinpath("schemas")

# Defaults so the runner finds the binary and provider without per-call wiring;
# values already present in the environment are left untouched.
os.environ.setdefault("SKILLTEST_BIN", str(SKILLTEST_BIN))
os.environ.setdefault("SKILLTEST_PROVIDER", str(FAKE_PROVIDER))
25
+
26
+
27
@pytest.fixture(scope="session", autouse=True)
def _require_binaries() -> None:
    """Fail with a clear, traceback-free message when a built binary is missing."""
    absent: list[str] = []
    for binary in (SKILLTEST_BIN, FAKE_PROVIDER):
        if not binary.exists():
            absent.append(str(binary))
    if not absent:
        return
    names = ", ".join(absent)
    pytest.fail(
        f"built binaries not found: {names}. Run `just bootstrap` (cargo build) first.",
        pytrace=False,
    )
36
+
37
+
38
@pytest.fixture
def fixtures() -> Path:
    """Path to the repository-level ``tests/fixtures`` directory."""
    return FIXTURES
41
+
42
+
43
@pytest.fixture
def cases(fixtures: Path) -> Path:
    """Path to the ``cases`` directory inside the shared fixtures directory."""
    return fixtures.joinpath("cases")
46
+
47
+
48
@pytest.fixture
def schemas() -> Path:
    """Path to the repository-level ``schemas`` directory."""
    return SCHEMAS