skilltest-pytest 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""skilltest-pytest: run AI-skill tests and natural-language evals as pytest.
|
|
2
|
+
|
|
3
|
+
The pytest integration on top of [`skilltest-sdk`][skilltest_sdk]: drop a
|
|
4
|
+
``*.skilltest.yaml`` next to your other tests and pytest collects it as a test
|
|
5
|
+
item. The SDK's code-level API is re-exported here for convenience, so a pytest
|
|
6
|
+
suite only needs one dependency:
|
|
7
|
+
|
|
8
|
+
from skilltest_pytest import assistant_text, describe_failures, run_skill, validate_skill
|
|
9
|
+
|
|
10
|
+
def test_greeter():
|
|
11
|
+
report = run_skill("cases/greet.yaml")
|
|
12
|
+
assert report.passed, describe_failures(report)
|
|
13
|
+
# Mix in a deterministic check on the transcript:
|
|
14
|
+
assert "Dr. Smith" in assistant_text(report.runs[0].transcript)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from skilltest_sdk import (
|
|
20
|
+
ENV_BIN,
|
|
21
|
+
ENV_PROVIDER,
|
|
22
|
+
BooleanDetail,
|
|
23
|
+
CaseRun,
|
|
24
|
+
EvalOutcome,
|
|
25
|
+
Message,
|
|
26
|
+
NumericDetail,
|
|
27
|
+
Report,
|
|
28
|
+
SkilltestError,
|
|
29
|
+
SkilltestProviderError,
|
|
30
|
+
SkilltestUsageError,
|
|
31
|
+
Summary,
|
|
32
|
+
Transcript,
|
|
33
|
+
Usage,
|
|
34
|
+
ValidationFinding,
|
|
35
|
+
ValidationReport,
|
|
36
|
+
assistant_text,
|
|
37
|
+
describe_failures,
|
|
38
|
+
failed_evals,
|
|
39
|
+
failed_runs,
|
|
40
|
+
run_skill,
|
|
41
|
+
validate_skill,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
from .plugin import SkilltestFailure
|
|
45
|
+
|
|
46
|
+
__all__ = [
|
|
47
|
+
"ENV_BIN",
|
|
48
|
+
"ENV_PROVIDER",
|
|
49
|
+
"BooleanDetail",
|
|
50
|
+
"CaseRun",
|
|
51
|
+
"EvalOutcome",
|
|
52
|
+
"Message",
|
|
53
|
+
"NumericDetail",
|
|
54
|
+
"Report",
|
|
55
|
+
"SkilltestError",
|
|
56
|
+
"SkilltestFailure",
|
|
57
|
+
"SkilltestProviderError",
|
|
58
|
+
"SkilltestUsageError",
|
|
59
|
+
"Summary",
|
|
60
|
+
"Transcript",
|
|
61
|
+
"Usage",
|
|
62
|
+
"ValidationFinding",
|
|
63
|
+
"ValidationReport",
|
|
64
|
+
"assistant_text",
|
|
65
|
+
"describe_failures",
|
|
66
|
+
"failed_evals",
|
|
67
|
+
"failed_runs",
|
|
68
|
+
"run_skill",
|
|
69
|
+
"validate_skill",
|
|
70
|
+
]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""pytest integration: collect ``*.skilltest.yaml`` files as test items.
|
|
2
|
+
|
|
3
|
+
Drop a ``greets.skilltest.yaml`` next to your other tests and `pytest` will run
|
|
4
|
+
it as a case, failing with the judge's reasons when an eval does not pass. For
|
|
5
|
+
finer control — multiple platforms/models, or deterministic mix-in assertions on
|
|
6
|
+
the transcript — call [`run_skill`][skilltest_sdk.runner.run_skill] from an
|
|
7
|
+
ordinary test function instead.
|
|
8
|
+
|
|
9
|
+
Settings come from ``pytest.ini``/``pyproject.toml`` (``skilltest_bin``,
|
|
10
|
+
``skilltest_provider``, ``skilltest_platforms``, ``skilltest_models``,
|
|
11
|
+
``skilltest_config``) or the ``SKILLTEST_BIN`` / ``SKILLTEST_PROVIDER`` env vars.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import TYPE_CHECKING
|
|
17
|
+
|
|
18
|
+
import pytest
|
|
19
|
+
from skilltest_sdk import Report, describe_failures, run_skill
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from collections.abc import Sequence
|
|
23
|
+
|
|
24
|
+
_SUFFIXES = (".skilltest.yaml", ".skilltest.yml")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def pytest_addoption(parser: pytest.Parser) -> None:
    """Register the plugin's ini settings on pytest's option parser.

    Declares ``skilltest_bin``, ``skilltest_provider``, ``skilltest_platforms``,
    ``skilltest_models`` and ``skilltest_config``. The two list-valued settings
    use the ``args`` ini type and default to an empty list; the rest default to
    ``None``.
    """
    # (ini name, help text, extra keyword arguments), in registration order.
    ini_options = (
        ("skilltest_bin", "Path to the skilltest binary", {"default": None}),
        ("skilltest_provider", "Provider command for skilltest", {"default": None}),
        ("skilltest_platforms", "Platforms to run cases on", {"type": "args", "default": []}),
        ("skilltest_models", "Models to run cases on", {"type": "args", "default": []}),
        ("skilltest_config", "Path to a skilltest config file", {"default": None}),
    )
    for ini_name, help_text, extra in ini_options:
        parser.addini(ini_name, help_text, **extra)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def pytest_collect_file(parent: pytest.Collector, file_path) -> SkilltestFile | None:
    """Turn skilltest case files into collector nodes.

    Args:
        parent: Collector node the new file node is attached to.
        file_path: Path of the candidate file; only its ``name`` is inspected.

    Returns:
        A ``SkilltestFile`` built via ``from_parent`` when the file name ends
        with one of ``_SUFFIXES``, otherwise ``None``.
    """
    # ``str.endswith`` accepts a tuple, so a single call tests every suffix.
    if file_path.name.endswith(_SUFFIXES):
        return SkilltestFile.from_parent(parent, path=file_path)
    return None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class _Settings:
    """Plugin settings resolved from the ini values of a pytest config.

    Scalar settings (``bin``, ``provider``, ``config``) are normalised so that
    any falsy ini value becomes ``None``; the list settings (``platforms``,
    ``models``) are stored exactly as ``getini`` returns them.
    """

    def __init__(self, config: pytest.Config) -> None:
        def optional(key: str) -> str | None:
            # Collapse an unset/empty ini value to None.
            return config.getini(key) or None

        self.bin: str | None = optional("skilltest_bin")
        self.provider: str | None = optional("skilltest_provider")
        self.platforms: Sequence[str] = config.getini("skilltest_platforms")
        self.models: Sequence[str] = config.getini("skilltest_models")
        self.config: str | None = optional("skilltest_config")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class SkilltestFailure(Exception):
    """Error raised for a collected case whose report did not pass.

    The exception message is the text produced by ``describe_failures`` for
    the report, and the report itself stays available as ``self.report``.
    """

    def __init__(self, report: Report) -> None:
        message = describe_failures(report)
        super().__init__(message)
        self.report = report
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class SkilltestFile(pytest.File):
    """Collector node for one skilltest case file."""

    def collect(self):  # type: ignore[override]
        """Yield the single ``SkilltestItem`` that represents this file."""
        # ``Path.stem`` strips only the last suffix, so ``greet.skilltest.yaml``
        # produces an item named ``greet.skilltest``.
        item_name = self.path.stem
        yield SkilltestItem.from_parent(self, name=item_name)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class SkilltestItem(pytest.Item):
    """Test item that executes one skilltest case file through ``run_skill``."""

    def runtest(self) -> None:
        """Run the case with the configured settings.

        Raises:
            SkilltestFailure: If the returned report's ``passed`` is falsy.
        """
        cfg = _Settings(self.config)
        run_options = {
            "bin": cfg.bin,
            "provider": cfg.provider,
            "platforms": cfg.platforms,
            "models": cfg.models,
            "config": cfg.config,
        }
        report = run_skill(self.path, **run_options)
        if report.passed:
            return
        raise SkilltestFailure(report)

    def repr_failure(self, excinfo, style=None):  # type: ignore[override]
        """Render a failure; case failures get a compact text form."""
        failure = excinfo.value
        if not isinstance(failure, SkilltestFailure):
            # Anything else goes through the base class's representation.
            return super().repr_failure(excinfo, style=style)
        return f"skilltest case failed:\n{failure}"

    def reportinfo(self):  # type: ignore[override]
        """Return ``(path, line number, description)`` for this item."""
        label = f"skilltest: {self.name}"
        return self.path, 0, label
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: skilltest-pytest
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: pytest integration for skilltest: auto-collect *.skilltest.yaml cases as pytest tests, built on skilltest-sdk.
|
|
5
|
+
Author: Nick DeRobertis
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: pytest>=8
|
|
9
|
+
Requires-Dist: skilltest-sdk<0.2,>=0.1.0
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# skilltest-pytest
|
|
13
|
+
|
|
14
|
+
A [pytest](https://pytest.org) plugin for [skilltest](../../README.md): run
|
|
15
|
+
AI-skill tests and natural-language evals as ordinary pytest tests, and mix in
|
|
16
|
+
your own deterministic checks. Built on
|
|
17
|
+
[`skilltest-sdk`](../../sdks/python/README.md) — the SDK's code API is
|
|
18
|
+
re-exported here, so a pytest suite needs only this one dependency.
|
|
19
|
+
|
|
20
|
+
## Two ways to use it
|
|
21
|
+
|
|
22
|
+
**Auto-collected case files.** Name a case `something.skilltest.yaml` and pytest
|
|
23
|
+
runs it:
|
|
24
|
+
|
|
25
|
+
```yaml
|
|
26
|
+
# greet.skilltest.yaml
|
|
27
|
+
skill: ./skills/greeter
|
|
28
|
+
input: "Greet Dr. Smith."
|
|
29
|
+
evals:
|
|
30
|
+
- type: boolean
|
|
31
|
+
criterion: "the reply greets Dr. Smith by name"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**As code**, for matrices and deterministic mix-ins:
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from skilltest_pytest import assistant_text, describe_failures, run_skill
|
|
38
|
+
|
|
39
|
+
def test_greeter():
|
|
40
|
+
report = run_skill("cases/greet.yaml", platforms=["claude-code"], models=["claude-opus-4-8"])
|
|
41
|
+
assert report.passed, describe_failures(report)
|
|
42
|
+
assert "Dr. Smith" in assistant_text(report.runs[0].transcript)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Configuration
|
|
46
|
+
|
|
47
|
+
The plugin shells out to the `skilltest` binary. Point it at one with the
|
|
48
|
+
`SKILLTEST_BIN` env var (or `bin=`), choose the provider with `SKILLTEST_PROVIDER` (or
|
|
49
|
+
`provider=`), and set defaults in `pyproject.toml`:
|
|
50
|
+
|
|
51
|
+
```toml
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
skilltest_provider = "oneharness"
|
|
54
|
+
skilltest_platforms = ["claude-code"]
|
|
55
|
+
skilltest_models = ["claude-opus-4-8"]
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
See the repository root for the provider protocol and the full schema.
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
skilltest_pytest/__init__.py,sha256=zUtKpvSZqJlJvX3LicBIJ0xB6ni0vPrWyyQ86WSTPV4,1645
|
|
2
|
+
skilltest_pytest/plugin.py,sha256=6MgeZLz7s_gfM0NodYxG-A_Qlm1RSD-mglt6Y7gE0i4,3374
|
|
3
|
+
skilltest_pytest-0.2.0.dist-info/METADATA,sha256=DmT5vH94tbSGAuiyRLTC9Vl7xMqR3uKKjFbEC9kaO98,1788
|
|
4
|
+
skilltest_pytest-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
5
|
+
skilltest_pytest-0.2.0.dist-info/entry_points.txt,sha256=r7Haj7qWnhqk2CyI8I1yttI8L7qXG9ssAZ5Nyccz4Ro,47
|
|
6
|
+
skilltest_pytest-0.2.0.dist-info/RECORD,,
|