pytest-skill-engineering 0.6.4__tar.gz → 0.6.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/PKG-INFO +5 -4
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/README.md +3 -3
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/pyproject.toml +3 -2
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/judge.py +1 -1
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/runner.py +1 -1
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/evals.py +1 -1
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/plugin.py +1 -1
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/clarification.py +1 -1
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/rate_limiter.py +2 -2
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/factories.py +1 -1
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/llm_assert.py +6 -4
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/llm_assert_image.py +7 -4
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/llm_score.py +9 -4
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/plugin.py +3 -2
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/plugin_options.py +3 -3
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/plugin_report.py +3 -2
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/prompts/ai_summary.md +3 -3
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/prompts/coding_agent_analysis.md +2 -2
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/insights.py +2 -2
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/.gitignore +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/LICENSE +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/__init__.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/cli.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/__init__.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/config.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/eval.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/evals.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/events.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/fixtures.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/personas.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/copilot/result.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/__init__.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/errors.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/prompt.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/result.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/scoring.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/serialization.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_benchmark.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_eval_results.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_evals.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_grading.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/core/skill_refiner.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/__init__.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/cost.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/servers.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/execution/skill_tools.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/__init__.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/iteration.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/skill_benchmark.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/skill_eval.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/fixtures/skill_refine.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/hooks.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/plugin_recording.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/prompts/__init__.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/__init__.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/collector.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/__init__.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/agent_leaderboard.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/agent_selector.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/overlay.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/report.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/test_comparison.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/test_grid.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/components/types.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/generator.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/reporting/markdown.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/templates/partials/report.css +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/templates/partials/scripts.js +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/__init__.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/banking.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/banking_mcp.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/todo.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/todo_mcp.py +0 -0
- {pytest_skill_engineering-0.6.4 → pytest_skill_engineering-0.6.6}/src/pytest_skill_engineering/testing/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pytest-skill-engineering
|
|
3
|
-
Version: 0.6.4
|
|
3
|
+
Version: 0.6.6
|
|
4
4
|
Summary: The testing framework for skill engineering. Test tool descriptions, prompt templates, agent skills, and custom agents with real LLMs. AI analyzes results and tells you what to fix.
|
|
5
5
|
Project-URL: Homepage, https://github.com/sbroenne/pytest-skill-engineering
|
|
6
6
|
Project-URL: Repository, https://github.com/sbroenne/pytest-skill-engineering
|
|
@@ -28,6 +28,7 @@ Requires-Dist: nh3>=0.3.3
|
|
|
28
28
|
Requires-Dist: pydantic>=2.0
|
|
29
29
|
Requires-Dist: pytest>=9.0
|
|
30
30
|
Requires-Dist: python-frontmatter>=1.1.0
|
|
31
|
+
Requires-Dist: trio>=0.33.0
|
|
31
32
|
Provides-Extra: dev
|
|
32
33
|
Requires-Dist: pre-commit>=4.5; extra == 'dev'
|
|
33
34
|
Requires-Dist: pyright>=1.1.408; extra == 'dev'
|
|
@@ -62,7 +63,7 @@ Test MCP servers, CLI tools, Agent Skills, and custom agents using the **real Gi
|
|
|
62
63
|
Your MCP server passes all unit tests. Then a user tries it in GitHub Copilot and:
|
|
63
64
|
|
|
64
65
|
- Copilot picks the wrong tool
|
|
65
|
-
- Passes garbage parameters
|
|
66
|
+
- Passes garbage parameters
|
|
66
67
|
- Can't recover from errors
|
|
67
68
|
- Ignores your skill's instructions
|
|
68
69
|
|
|
@@ -97,7 +98,7 @@ async def test_balance_query(copilot_eval):
|
|
|
97
98
|
max_turns=10,
|
|
98
99
|
)
|
|
99
100
|
result = await copilot_eval(agent, "What's my checking balance?")
|
|
100
|
-
|
|
101
|
+
|
|
101
102
|
assert result.success
|
|
102
103
|
assert result.tool_was_called("get_balance")
|
|
103
104
|
```
|
|
@@ -148,7 +149,7 @@ The AI-powered report needs a model to generate insights. Configure it in `pypro
|
|
|
148
149
|
|
|
149
150
|
```toml
|
|
150
151
|
[tool.pytest.ini_options]
|
|
151
|
-
addopts = "--aitest-summary-model=copilot/gpt-5-mini"
|
|
152
|
+
addopts = "--aitest-summary-model=copilot/gpt-5.4-mini"
|
|
152
153
|
```
|
|
153
154
|
|
|
154
155
|
You can also use Azure OpenAI or other providers if you prefer — see [Configuration](https://sbroenne.github.io/pytest-skill-engineering/reference/configuration/).
|
|
@@ -14,7 +14,7 @@ Test MCP servers, CLI tools, Agent Skills, and custom agents using the **real Gi
|
|
|
14
14
|
Your MCP server passes all unit tests. Then a user tries it in GitHub Copilot and:
|
|
15
15
|
|
|
16
16
|
- Copilot picks the wrong tool
|
|
17
|
-
- Passes garbage parameters
|
|
17
|
+
- Passes garbage parameters
|
|
18
18
|
- Can't recover from errors
|
|
19
19
|
- Ignores your skill's instructions
|
|
20
20
|
|
|
@@ -49,7 +49,7 @@ async def test_balance_query(copilot_eval):
|
|
|
49
49
|
max_turns=10,
|
|
50
50
|
)
|
|
51
51
|
result = await copilot_eval(agent, "What's my checking balance?")
|
|
52
|
-
|
|
52
|
+
|
|
53
53
|
assert result.success
|
|
54
54
|
assert result.tool_was_called("get_balance")
|
|
55
55
|
```
|
|
@@ -100,7 +100,7 @@ The AI-powered report needs a model to generate insights. Configure it in `pypro
|
|
|
100
100
|
|
|
101
101
|
```toml
|
|
102
102
|
[tool.pytest.ini_options]
|
|
103
|
-
addopts = "--aitest-summary-model=copilot/gpt-5-mini"
|
|
103
|
+
addopts = "--aitest-summary-model=copilot/gpt-5.4-mini"
|
|
104
104
|
```
|
|
105
105
|
|
|
106
106
|
You can also use Azure OpenAI or other providers if you prefer — see [Configuration](https://sbroenne.github.io/pytest-skill-engineering/reference/configuration/).
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pytest-skill-engineering"
|
|
7
|
-
version = "0.6.4"
|
|
7
|
+
version = "0.6.6"
|
|
8
8
|
description = "The testing framework for skill engineering. Test tool descriptions, prompt templates, agent skills, and custom agents with real LLMs. AI analyzes results and tells you what to fix."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -35,6 +35,7 @@ dependencies = [
|
|
|
35
35
|
"python-frontmatter>=1.1.0",
|
|
36
36
|
"nh3>=0.3.3",
|
|
37
37
|
"github-copilot-sdk>=0.2.2",
|
|
38
|
+
"trio>=0.33.0",
|
|
38
39
|
]
|
|
39
40
|
|
|
40
41
|
[project.optional-dependencies]
|
|
@@ -119,7 +120,7 @@ filterwarnings = [
|
|
|
119
120
|
# This demonstrates the recommended setup - configure once in pyproject.toml.
|
|
120
121
|
# LLM auth is handled by the GitHub Copilot SDK (gh auth login or GITHUB_TOKEN)
|
|
121
122
|
addopts = """
|
|
122
|
-
--aitest-summary-model=copilot/gpt-5.
|
|
123
|
+
--aitest-summary-model=copilot/gpt-5.5
|
|
123
124
|
--aitest-html=aitest-reports/report.html
|
|
124
125
|
"""
|
|
125
126
|
markers = [
|
|
@@ -39,7 +39,7 @@ def _approve_all_permissions(*_args: Any, **_kwargs: Any) -> Any:
|
|
|
39
39
|
"""Approve all permission requests using the current SDK result type."""
|
|
40
40
|
from copilot.session import PermissionRequestResult # noqa: PLC0415
|
|
41
41
|
|
|
42
|
-
return PermissionRequestResult(kind="
|
|
42
|
+
return PermissionRequestResult(kind="approve-once")
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
def _get_data_field(event: Any, field: str, default: Any = None) -> Any:
|
|
@@ -115,7 +115,7 @@ def _approve_all_permissions(*_args: Any, **_kwargs: Any) -> Any:
|
|
|
115
115
|
"""Approve all permission requests using the current SDK result type."""
|
|
116
116
|
from copilot.session import PermissionRequestResult
|
|
117
117
|
|
|
118
|
-
return PermissionRequestResult(kind="
|
|
118
|
+
return PermissionRequestResult(kind="approve-once")
|
|
119
119
|
|
|
120
120
|
|
|
121
121
|
def _is_transient_error(error: str | None) -> bool:
|
|
@@ -38,7 +38,7 @@ Example usage::
|
|
|
38
38
|
from pytest_skill_engineering import Eval, Provider
|
|
39
39
|
agent = Eval.from_agent_file(
|
|
40
40
|
".github/agents/reviewer.agent.md",
|
|
41
|
-
provider=Provider(model="azure/gpt-5-mini"),
|
|
41
|
+
provider=Provider(model="azure/gpt-5.4-mini"),
|
|
42
42
|
)
|
|
43
43
|
|
|
44
44
|
# Use with CopilotEval
|
|
@@ -38,7 +38,7 @@ async def check_clarification(
|
|
|
38
38
|
|
|
39
39
|
Args:
|
|
40
40
|
response_text: The agent's final response text to classify.
|
|
41
|
-
judge_model: Model string (e.g. "gpt-5-mini", "claude-sonnet-4").
|
|
41
|
+
judge_model: Model string (e.g. "gpt-5.4-mini", "claude-sonnet-4").
|
|
42
42
|
timeout_seconds: Timeout for the judge LLM call.
|
|
43
43
|
|
|
44
44
|
Returns:
|
|
@@ -5,7 +5,7 @@ tokens per minute (tpm). Rate limiters are shared across all engine instances
|
|
|
5
5
|
using the same model, so concurrent tests respect deployment limits.
|
|
6
6
|
|
|
7
7
|
Usage:
|
|
8
|
-
limiter = get_rate_limiter("azure/gpt-5-mini", rpm=10, tpm=10000)
|
|
8
|
+
limiter = get_rate_limiter("azure/gpt-5.4-mini", rpm=10, tpm=10000)
|
|
9
9
|
await limiter.acquire() # Waits if rate limit would be exceeded
|
|
10
10
|
# ... make API call ...
|
|
11
11
|
limiter.record_tokens(1500) # Track token usage for tpm enforcement
|
|
@@ -38,7 +38,7 @@ def get_rate_limiter(
|
|
|
38
38
|
restrictive limits (minimum of old and new values).
|
|
39
39
|
|
|
40
40
|
Args:
|
|
41
|
-
model: Model identifier string (e.g. "azure/gpt-5-mini").
|
|
41
|
+
model: Model identifier string (e.g. "azure/gpt-5.4-mini").
|
|
42
42
|
rpm: Requests per minute limit.
|
|
43
43
|
tpm: Tokens per minute limit.
|
|
44
44
|
|
|
@@ -18,7 +18,7 @@ def skill_factory() -> Callable[[Path | str], Skill]:
|
|
|
18
18
|
def test_with_skill(skill_factory, eval_run):
|
|
19
19
|
skill = skill_factory("path/to/my-skill")
|
|
20
20
|
agent = Eval(
|
|
21
|
-
provider=Provider(model="azure/gpt-5-mini"),
|
|
21
|
+
provider=Provider(model="azure/gpt-5.4-mini"),
|
|
22
22
|
skill=skill,
|
|
23
23
|
)
|
|
24
24
|
result = await eval_run(agent, "Do something with the skill")
|
|
@@ -12,7 +12,7 @@ from dataclasses import dataclass
|
|
|
12
12
|
|
|
13
13
|
import pytest
|
|
14
14
|
|
|
15
|
-
_LLM_MODEL_DEFAULT = "copilot/gpt-5-mini"
|
|
15
|
+
_LLM_MODEL_DEFAULT = "copilot/gpt-5.4-mini"
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
@dataclass(slots=True)
|
|
@@ -111,17 +111,19 @@ def llm_assert(request: pytest.FixtureRequest) -> LLMAssert:
|
|
|
111
111
|
The judge model is resolved in this order:
|
|
112
112
|
1. ``--llm-model`` if explicitly set
|
|
113
113
|
2. ``--aitest-summary-model`` (same model for analysis and assertions)
|
|
114
|
-
3. ``copilot/gpt-5-mini`` as final fallback
|
|
114
|
+
3. ``copilot/gpt-5.4-mini`` as final fallback
|
|
115
115
|
|
|
116
116
|
Example::
|
|
117
117
|
|
|
118
118
|
def test_response(llm_assert):
|
|
119
119
|
assert llm_assert("Your balance is $1,500", "mentions a dollar amount")
|
|
120
120
|
"""
|
|
121
|
-
model_str
|
|
121
|
+
model_str = request.config.getoption("--llm-model")
|
|
122
|
+
if not isinstance(model_str, str):
|
|
123
|
+
model_str = _LLM_MODEL_DEFAULT
|
|
122
124
|
if model_str == _LLM_MODEL_DEFAULT:
|
|
123
125
|
# Not explicitly set — fall back to summary model if available
|
|
124
126
|
summary_model = request.config.getoption("--aitest-summary-model", default=None)
|
|
125
|
-
if summary_model:
|
|
127
|
+
if isinstance(summary_model, str) and summary_model:
|
|
126
128
|
model_str = summary_model
|
|
127
129
|
return LLMAssert(model=model_str)
|
|
@@ -74,7 +74,7 @@ def llm_assert_image(request: pytest.FixtureRequest) -> LLMAssertImage:
|
|
|
74
74
|
1. ``--llm-vision-model`` if explicitly set
|
|
75
75
|
2. ``--llm-model`` (same model for text and image assertions)
|
|
76
76
|
3. ``--aitest-summary-model``
|
|
77
|
-
4. ``copilot/gpt-5-mini`` as final fallback
|
|
77
|
+
4. ``copilot/gpt-5.4-mini`` as final fallback
|
|
78
78
|
|
|
79
79
|
NOTE: This fixture currently raises NotImplementedError when called,
|
|
80
80
|
as the Copilot SDK does not yet support image inputs in a documented way.
|
|
@@ -86,19 +86,22 @@ def llm_assert_image(request: pytest.FixtureRequest) -> LLMAssertImage:
|
|
|
86
86
|
screenshots = result.tool_images_for("screenshot")
|
|
87
87
|
assert llm_assert_image(screenshots[-1], "shows a bar chart")
|
|
88
88
|
"""
|
|
89
|
-
_LLM_MODEL_DEFAULT = "copilot/gpt-5-mini" # noqa: N806
|
|
89
|
+
_LLM_MODEL_DEFAULT = "copilot/gpt-5.4-mini" # noqa: N806
|
|
90
90
|
|
|
91
91
|
# Try vision-specific model first
|
|
92
|
-
|
|
92
|
+
vision_model_option = request.config.getoption("--llm-vision-model", default=None)
|
|
93
|
+
vision_model_str = vision_model_option if isinstance(vision_model_option, str) else None
|
|
93
94
|
|
|
94
95
|
if vision_model_str:
|
|
95
96
|
model_str = vision_model_str
|
|
96
97
|
else:
|
|
97
98
|
# Fall back to llm-model → summary model → default
|
|
98
99
|
model_str = request.config.getoption("--llm-model")
|
|
100
|
+
if not isinstance(model_str, str):
|
|
101
|
+
model_str = _LLM_MODEL_DEFAULT
|
|
99
102
|
if model_str == _LLM_MODEL_DEFAULT:
|
|
100
103
|
summary_model = request.config.getoption("--aitest-summary-model", default=None)
|
|
101
|
-
if summary_model:
|
|
104
|
+
if isinstance(summary_model, str) and summary_model:
|
|
102
105
|
model_str = summary_model
|
|
103
106
|
|
|
104
107
|
return LLMAssertImage(model=model_str)
|
|
@@ -20,9 +20,12 @@ import pytest
|
|
|
20
20
|
from pytest_skill_engineering.core.scoring import (
|
|
21
21
|
ScoreResult,
|
|
22
22
|
ScoringDimension,
|
|
23
|
+
assert_score,
|
|
23
24
|
)
|
|
24
25
|
|
|
25
|
-
_LLM_MODEL_DEFAULT = "copilot/gpt-5-mini"
|
|
26
|
+
_LLM_MODEL_DEFAULT = "copilot/gpt-5.4-mini"
|
|
27
|
+
|
|
28
|
+
__all__ = ["LLMScore", "ScoreResult", "ScoringDimension", "assert_score", "llm_score"]
|
|
26
29
|
|
|
27
30
|
|
|
28
31
|
# ---------------------------------------------------------------------------
|
|
@@ -240,7 +243,7 @@ def llm_score(request: pytest.FixtureRequest) -> LLMScore:
|
|
|
240
243
|
|
|
241
244
|
1. ``--llm-model`` if explicitly set
|
|
242
245
|
2. ``--aitest-summary-model`` (shared analysis model)
|
|
243
|
-
3. ``copilot/gpt-5-mini`` as final fallback
|
|
246
|
+
3. ``copilot/gpt-5.4-mini`` as final fallback
|
|
244
247
|
|
|
245
248
|
Example::
|
|
246
249
|
|
|
@@ -254,9 +257,11 @@ def llm_score(request: pytest.FixtureRequest) -> LLMScore:
|
|
|
254
257
|
result = llm_score(my_text, rubric)
|
|
255
258
|
assert_score(result, min_total=7)
|
|
256
259
|
"""
|
|
257
|
-
model_str
|
|
260
|
+
model_str = request.config.getoption("--llm-model")
|
|
261
|
+
if not isinstance(model_str, str):
|
|
262
|
+
model_str = _LLM_MODEL_DEFAULT
|
|
258
263
|
if model_str == _LLM_MODEL_DEFAULT:
|
|
259
264
|
summary_model = request.config.getoption("--aitest-summary-model", default=None)
|
|
260
|
-
if summary_model:
|
|
265
|
+
if isinstance(summary_model, str) and summary_model:
|
|
261
266
|
model_str = summary_model
|
|
262
267
|
return LLMScore(model=model_str)
|
|
@@ -181,7 +181,8 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
|
|
|
181
181
|
receives the parameter even though it does not declare the fixture
|
|
182
182
|
explicitly.
|
|
183
183
|
"""
|
|
184
|
-
|
|
184
|
+
count_option = metafunc.config.getoption("--aitest-iterations", default=1)
|
|
185
|
+
count = count_option if isinstance(count_option, int) else 1
|
|
185
186
|
if count <= 1:
|
|
186
187
|
return
|
|
187
188
|
metafunc.fixturenames.append("_aitest_iteration")
|
|
@@ -360,7 +361,7 @@ def _add_junit_properties(
|
|
|
360
361
|
<testcase name="test_balance">
|
|
361
362
|
<properties>
|
|
362
363
|
<property name="aitest.agent.name" value="banking-agent"/>
|
|
363
|
-
<property name="aitest.model" value="gpt-5-mini"/>
|
|
364
|
+
<property name="aitest.model" value="gpt-5.4-mini"/>
|
|
364
365
|
<property name="aitest.skill" value="financial-advisor"/>
|
|
365
366
|
<property name="aitest.tools.called" value="get_balance,transfer"/>
|
|
366
367
|
</properties>
|
|
@@ -24,7 +24,7 @@ def add_aitest_options(group: OptionGroup) -> None:
|
|
|
24
24
|
default=None,
|
|
25
25
|
help=(
|
|
26
26
|
"Model for AI analysis. Required when generating reports. "
|
|
27
|
-
"Use the most capable model you can afford (e.g., gpt-5.
|
|
27
|
+
"Use the most capable model you can afford (e.g., gpt-5.5, claude-opus-4)."
|
|
28
28
|
),
|
|
29
29
|
)
|
|
30
30
|
|
|
@@ -107,10 +107,10 @@ def add_aitest_options(group: OptionGroup) -> None:
|
|
|
107
107
|
# LLM judge model for llm_assert fixture
|
|
108
108
|
group.addoption(
|
|
109
109
|
"--llm-model",
|
|
110
|
-
default="copilot/gpt-5-mini",
|
|
110
|
+
default="copilot/gpt-5.4-mini",
|
|
111
111
|
help=(
|
|
112
112
|
"Model for llm_assert semantic assertions. "
|
|
113
|
-
"Defaults to --aitest-summary-model if set, otherwise copilot/gpt-5-mini."
|
|
113
|
+
"Defaults to --aitest-summary-model if set, otherwise copilot/gpt-5.4-mini."
|
|
114
114
|
),
|
|
115
115
|
)
|
|
116
116
|
|
|
@@ -107,7 +107,8 @@ def generate_structured_insights(
|
|
|
107
107
|
from pytest_skill_engineering.reporting.insights import generate_insights
|
|
108
108
|
|
|
109
109
|
# Require dedicated summary model - no fallback
|
|
110
|
-
|
|
110
|
+
model_option = config.getoption("--aitest-summary-model")
|
|
111
|
+
model = model_option if isinstance(model_option, str) else None
|
|
111
112
|
if not model:
|
|
112
113
|
if required:
|
|
113
114
|
raise pytest.UsageError(
|
|
@@ -196,7 +197,7 @@ def generate_structured_insights(
|
|
|
196
197
|
model=model,
|
|
197
198
|
min_pass_rate=config.getoption("--aitest-min-pass-rate"),
|
|
198
199
|
analysis_prompt=analysis_prompt,
|
|
199
|
-
compact=config.getoption("--aitest-summary-compact"),
|
|
200
|
+
compact=config.getoption("--aitest-summary-compact") is True,
|
|
200
201
|
)
|
|
201
202
|
|
|
202
203
|
# Use asyncio.run() instead of deprecated get_event_loop().run_until_complete()
|
|
@@ -5,7 +5,7 @@ You are analyzing test results for **pytest-skill-engineering**, a skill enginee
|
|
|
5
5
|
## Key Concepts
|
|
6
6
|
|
|
7
7
|
An **Eval** is a complete test configuration — the harness that exercises the skill stack:
|
|
8
|
-
- **Model**: The LLM (e.g., `gpt-5-mini`, `gpt-4.1`)
|
|
8
|
+
- **Model**: The LLM (e.g., `gpt-5.4-mini`, `gpt-4.1`)
|
|
9
9
|
- **MCP/CLI Servers**: The tools being tested (tool descriptions + schemas)
|
|
10
10
|
- **MCP Prompt Templates**: Slash-command prompts bundled with MCP servers (e.g., `/mcp.servername.promptname`)
|
|
11
11
|
- **Skill**: Optional domain knowledge injected into context
|
|
@@ -309,7 +309,7 @@ Use these sections as needed (skip sections with no content):
|
|
|
309
309
|
- **Effective**: Eval followed instructions correctly
|
|
310
310
|
- **Mixed**: Some tests passed, others showed confusion
|
|
311
311
|
- **Ineffective**: Instructions ignored or misunderstood
|
|
312
|
-
- **Model-specific effectiveness**: Instructions that fail with one model may succeed with another. If a variant was tested with multiple models (e.g., `gpt-5-mini + detailed` failed but `gpt-4.1 + detailed` passed), label it **mixed** — NOT ineffective. Only label instructions **ineffective** if they failed across ALL models tested. Always qualify: "ineffective with gpt-5-mini" rather than just "ineffective".
|
|
312
|
+
- **Model-specific effectiveness**: Instructions that fail with one model may succeed with another. If a variant was tested with multiple models (e.g., `gpt-5.4-mini + detailed` failed but `gpt-4.1 + detailed` passed), label it **mixed** — NOT ineffective. Only label instructions **ineffective** if they failed across ALL models tested. Always qualify: "ineffective with gpt-5.4-mini" rather than just "ineffective".
|
|
313
313
|
- Note token bloat: "150 tokens of examples could be removed"
|
|
314
314
|
|
|
315
315
|
### Skill Feedback
|
|
@@ -372,7 +372,7 @@ Use these sections as needed (skip sections with no content):
|
|
|
372
372
|
- **Gauge color values**: green=#4ade80, amber=#facc15, red=#f87171, blue=#60a5fa
|
|
373
373
|
12. **Use pre-computed numbers** — The input includes a "Pre-computed Eval Statistics" section with exact values for pass rates, costs, tokens, winner designation, and aggregate stats (total tests, failures, agents, avg turns). Use these numbers verbatim. Never estimate or approximate.
|
|
374
374
|
13. **Cost comparisons must use actual data** — When comparing costs between agents, use the **actual per-test cost** from the pre-computed statistics (total cost ÷ number of tests). Never cite model list pricing or theoretical cost differences. A cheaper model may use more tokens, making the realized cost difference much smaller than the per-token price difference. For example, if model A costs $0.0018/test and model B costs $0.0025/test, say "~28% cheaper" — NOT "85% cheaper" or "6× cheaper" based on list pricing.
|
|
375
|
-
14. **Instruction labels must be model-specific** — Never label custom agent instructions as globally "ineffective" or globally "effective" when tested with multiple models and produced different outcomes. If `gpt-5-mini + detailed` failed but `gpt-4.1 + detailed` passed, the instructions are "mixed" — effective with gpt-4.1, ineffective with gpt-5-mini. The same applies to the Optimizations section: do not say "restrict [instructions] usage" if they work correctly with some models.
|
|
375
|
+
14. **Instruction labels must be model-specific** — Never label custom agent instructions as globally "ineffective" or globally "effective" when tested with multiple models and produced different outcomes. If `gpt-5.4-mini + detailed` failed but `gpt-4.1 + detailed` passed, the instructions are "mixed" — effective with gpt-4.1, ineffective with gpt-5.4-mini. The same applies to the Optimizations section: do not say "restrict [instructions] usage" if they work correctly with some models.
|
|
376
376
|
15. **Bullet lists need a blank line before them** — In markdown, a list must be preceded by a blank line to render correctly. NEVER put a bullet list directly after a `**bold label:**` on the next line — the markdown parser will collapse them into a single paragraph. Use `####` headings instead of bold labels when you need a label followed by a list.
|
|
377
377
|
16. **Iteration awareness** — When iteration data is present ("Iter Pass Rate" in Pre-computed Eval Statistics), factor consistency into your recommendation. An agent with 100% pass rate at 5/5 iterations is more reliable than one with 100% pass rate at 3/5 iterations. Flag tests with <100% iteration pass rate as **flaky** in your analysis. When no iteration data is present, skip all iteration-related analysis.
|
|
378
378
|
17. **Score awareness** — When LLM score data is present (`LLM Score: X/Y (Z%)`), mention the weighted score in the Winner Card summary and note any dimensions below 70% in the analysis. When no score data exists, skip all score-related commentary.
|
|
@@ -288,7 +288,7 @@ Use these sections as needed (skip sections with no content):
|
|
|
288
288
|
- **Effective**: Eval followed instructions and completed tasks correctly
|
|
289
289
|
- **Mixed**: Some tasks succeeded, others showed the agent ignoring or misunderstanding instructions
|
|
290
290
|
- **Ineffective**: Instructions were ignored or produced worse behavior
|
|
291
|
-
- **Model-specific effectiveness**: An instruction that fails with one model may succeed with another. If an instruction variant was tested with multiple models (e.g., `gpt-5-mini + verbose` failed but `gpt-4.1 + verbose` passed), label it **mixed** — NOT ineffective. Only label an instruction **ineffective** if it failed across ALL models it was tested with. Always qualify: "ineffective with gpt-5-mini" rather than just "ineffective".
|
|
291
|
+
- **Model-specific effectiveness**: An instruction that fails with one model may succeed with another. If an instruction variant was tested with multiple models (e.g., `gpt-5.4-mini + verbose` failed but `gpt-4.1 + verbose` passed), label it **mixed** — NOT ineffective. Only label an instruction **ineffective** if it failed across ALL models it was tested with. Always qualify: "ineffective with gpt-5.4-mini" rather than just "ineffective".
|
|
292
292
|
- Always show the problematic instruction text and a concrete replacement
|
|
293
293
|
|
|
294
294
|
### Tool Usage
|
|
@@ -357,5 +357,5 @@ Use these sections as needed (skip sections with no content):
|
|
|
357
357
|
- **No inline color styles** — use only the CSS class names (green, blue, amber, red) on metric-card and metric-value
|
|
358
358
|
12. **Use pre-computed numbers** — The input includes a "Pre-computed Eval Statistics" section with exact values for pass rates, costs, tokens, winner designation, and aggregate stats (total tests, failures, agents, avg turns). Use these numbers verbatim. Never estimate or approximate.
|
|
359
359
|
13. **Cost comparisons must use actual data** — When comparing costs between agents, use the **actual per-test cost** from the pre-computed statistics (total cost ÷ number of tests). Never cite model list pricing or theoretical cost differences. A cheaper model may use more tokens, making the realized cost difference much smaller than the per-token price difference.
|
|
360
|
-
14. **Instruction labels must be model-specific** — Never label instructions as globally "ineffective" or globally "effective" when tested with multiple models producing different outcomes. If `gpt-5-mini + verbose` failed but `gpt-4.1 + verbose` passed, the instructions are "mixed" — effective with gpt-4.1, ineffective with gpt-5-mini.
|
|
360
|
+
14. **Instruction labels must be model-specific** — Never label instructions as globally "ineffective" or globally "effective" when tested with multiple models producing different outcomes. If `gpt-5.4-mini + verbose` failed but `gpt-4.1 + verbose` passed, the instructions are "mixed" — effective with gpt-4.1, ineffective with gpt-5.4-mini.
|
|
361
361
|
15. **Bullet lists need a blank line before them** — In markdown, a list must be preceded by a blank line to render correctly. NEVER put a bullet list directly after a `**bold label:**` on the next line — the markdown parser will collapse them into a single paragraph. Use `####` headings instead of bold labels when you need a label followed by a list.
|
|
@@ -479,7 +479,7 @@ async def generate_insights(
|
|
|
479
479
|
custom_agent_info: list[CustomAgentInfo] | None = None,
|
|
480
480
|
prompt_names: list[str] | None = None,
|
|
481
481
|
instruction_file_info: list[InstructionFileInfo] | None = None,
|
|
482
|
-
model: str = "copilot/gpt-5-mini",
|
|
482
|
+
model: str = "copilot/gpt-5.4-mini",
|
|
483
483
|
cache_dir: Path | None = None,
|
|
484
484
|
min_pass_rate: int | None = None,
|
|
485
485
|
analysis_prompt: str | None = None,
|
|
@@ -496,7 +496,7 @@ async def generate_insights(
|
|
|
496
496
|
custom_agent_info: Custom agent metadata (optional)
|
|
497
497
|
prompt_names: Names of prompt files tested (optional)
|
|
498
498
|
instruction_file_info: Custom instruction file metadata (optional)
|
|
499
|
-
model: Model identifier (e.g., "copilot/gpt-5-mini", "azure/gpt-5-mini")
|
|
499
|
+
model: Model identifier (e.g., "copilot/gpt-5.4-mini", "azure/gpt-5.4-mini")
|
|
500
500
|
cache_dir: Directory for caching results (optional)
|
|
501
501
|
min_pass_rate: Minimum pass rate threshold for disqualifying agents
|
|
502
502
|
analysis_prompt: Custom analysis prompt text. If None, uses the built-in
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|