pytest_skill_engineering-0.0.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pytest_skill_engineering-0.0.1/.gitignore +64 -0
- pytest_skill_engineering-0.0.1/LICENSE +21 -0
- pytest_skill_engineering-0.0.1/PKG-INFO +223 -0
- pytest_skill_engineering-0.0.1/README.md +171 -0
- pytest_skill_engineering-0.0.1/pyproject.toml +152 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/__init__.py +165 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/cli.py +333 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/__init__.py +41 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/eval.py +278 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/evals.py +65 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/events.py +433 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/fixtures.py +183 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/model.py +382 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/personas.py +414 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/result.py +227 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/runner.py +223 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/__init__.py +78 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/errors.py +44 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/eval.py +510 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/evals.py +474 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/prompt.py +167 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/result.py +415 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/serialization.py +246 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/skill.py +250 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/__init__.py +13 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/clarification.py +78 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/cli_toolset.py +84 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/cost.py +167 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/engine.py +387 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/optimizer.py +163 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/pydantic_adapter.py +473 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/rate_limiter.py +177 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/servers.py +502 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/skill_tools.py +117 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/__init__.py +30 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/factories.py +44 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/iteration.py +16 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/llm_assert.py +123 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/llm_assert_image.py +126 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/llm_score.py +360 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/run.py +176 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/hooks.py +40 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/plugin.py +1073 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/prompts/__init__.py +24 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/prompts/ai_summary.md +378 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/prompts/coding_agent_analysis.md +361 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/__init__.py +30 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/collector.py +191 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/__init__.py +24 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/agent_leaderboard.py +146 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/agent_selector.py +58 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/overlay.py +75 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/report.py +426 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/test_comparison.py +355 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/test_grid.py +332 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/types.py +182 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/generator.py +637 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/insights.py +611 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/markdown.py +465 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/templates/partials/report.css +1000 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/templates/partials/scripts.js +110 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/__init__.py +17 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/banking.py +479 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/banking_mcp.py +148 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/todo.py +380 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/todo_mcp.py +155 -0
- pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/types.py +31 -0

**pytest_skill_engineering-0.0.1/.gitignore** (new file, 64 lines)

```gitignore
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
.venv/
venv/
ENV/
env/

# IDE
.idea/
.vscode/
*.swp
*.swo
*~

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
.nox/

# Type checking
.mypy_cache/

# Test outputs
test_results/
aitest-reports/
tests/fixtures/reports/html/

# Node.js (Tailwind build)
node_modules/
src/pytest_aitest/templates/node_modules/
package-lock.json

# Environment
.env
.env.local

# OS
.DS_Store
Thumbs.db
site/
.cache/
```

**pytest_skill_engineering-0.0.1/LICENSE** (new file, 21 lines)

MIT License

Copyright (c) 2026 Stefan Brunner

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

**pytest_skill_engineering-0.0.1/PKG-INFO** (new file, 223 lines)

```
Metadata-Version: 2.4
Name: pytest-skill-engineering
Version: 0.0.1
Summary: The testing framework for skill engineering. Test tool descriptions, prompt templates, agent skills, and custom agents with real LLMs. AI analyzes results and tells you what to fix.
Project-URL: Homepage, https://github.com/sbroenne/pytest-skill-engineering
Project-URL: Repository, https://github.com/sbroenne/pytest-skill-engineering
Author: Stefan Brunner
License: MIT
License-File: LICENSE
Keywords: agents,ai,custom-agents,llm,mcp,pytest,skill-engineering,skills,testing
Classifier: Development Status :: 3 - Alpha
Classifier: Framework :: Pytest
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Software Development :: Testing
Requires-Python: >=3.11
Requires-Dist: azure-identity>=1.25.2
Requires-Dist: htpy>=25.12.0
Requires-Dist: litellm>=1.81.13
Requires-Dist: markdown>=3.10.2
Requires-Dist: mcp>=1.26
Requires-Dist: mdutils>=1.8.1
Requires-Dist: pydantic-ai>=1.61.0
Requires-Dist: pydantic-evals>=1.61.0
Requires-Dist: pydantic>=2.0
Requires-Dist: pytest>=9.0
Requires-Dist: python-frontmatter>=1.1.0
Provides-Extra: copilot
Requires-Dist: github-copilot-sdk>=0.1.25; extra == 'copilot'
Provides-Extra: dev
Requires-Dist: pre-commit>=4.5; extra == 'dev'
Requires-Dist: pyright>=1.1.408; extra == 'dev'
Requires-Dist: pytest-asyncio>=1.3; extra == 'dev'
Requires-Dist: pytest-cov>=7.0; extra == 'dev'
Requires-Dist: python-dotenv>=1.2; extra == 'dev'
Requires-Dist: ruff>=0.15; extra == 'dev'
Requires-Dist: typeguard>=4.5; extra == 'dev'
Provides-Extra: docs
Requires-Dist: cairosvg>=2.7; extra == 'docs'
Requires-Dist: mkdocs-material<9.8,>=9.6; extra == 'docs'
Requires-Dist: mkdocs>=1.6; extra == 'docs'
Requires-Dist: mkdocstrings[python]>=0.24; extra == 'docs'
Requires-Dist: pillow>=11.0; extra == 'docs'
Provides-Extra: test
Requires-Dist: syrupy>=5.1; extra == 'test'
Description-Content-Type: text/markdown
```

The markdown long description that follows these headers is identical to the package `README.md`, shown in full as the next file.

**pytest_skill_engineering-0.0.1/README.md** (new file, 171 lines)

# pytest-skill-engineering

[PyPI](https://pypi.org/project/pytest-skill-engineering/)
[Python versions](https://pypi.org/project/pytest-skill-engineering/)
[CI](https://github.com/sbroenne/pytest-skill-engineering/actions/workflows/ci.yml)
[License: MIT](https://opensource.org/licenses/MIT)

**Skill Engineering. Test-driven. AI-analyzed.**

A pytest plugin for skill engineering — test your MCP server tools, prompt templates, agent skills, and `.agent.md` instruction files with real LLMs. Red/Green/Refactor for the skill stack. Let AI analysis tell you what to fix.

## Why?

Modern AI systems are built on **skill engineering** — the discipline of designing modular, reliable, callable capabilities that an LLM can discover, invoke, and orchestrate to perform real tasks. Skills are what separate "text generator" from "coding agent that actually does things."

An MCP server is the runtime for those skills. It doesn't ship alone — it comes bundled with the **full skill engineering stack**: **tools** (callable functions), **prompt templates** (server-side reasoning starters), **agent skills** (domain knowledge and behavioral guidelines), and **`.agent.md` instruction files** (specialist sub-agent definitions in VS Code / Claude Code format). Users layer on their own **prompt files** (slash commands like `/review`) on top.

Your unit tests cover the server code. Nothing covers the skill stack. And the skill stack is what the LLM actually sees.

**Skill engineering breaks in ways code tests can't catch:**

- The tool description is too vague — the LLM picks the wrong tool or passes garbage parameters
- The prompt template renders correctly but the assembled message confuses the LLM
- A prompt file's slash command produces garbage output because the instructions are ambiguous
- The skill has the right facts but is structured so poorly the LLM skips it
- The `.agent.md` file has the right tools listed but the description is too vague to trigger subagent dispatch

**And when you're improving it — how do you know version A is better than version B?**

Skill engineering is iterative — prompt tuning, tool description refinement, `.agent.md` instructions, skill structure. You need A/B testing built in. Run both versions, same prompts, and let the leaderboard tell you which one wins on pass rate and cost.

That's what pytest-skill-engineering does: test the full skill engineering stack, compare variants, and get AI analysis that tells you exactly what to fix.

## How It Works

Write tests as natural language prompts — you assert on what happened. If a test fails, your tool descriptions or skill need work, not your code:

1. **Write a test** — a prompt that describes what a user would say
2. **Run it** — an LLM tries to use your tools and fails
3. **Fix the skill stack** — improve tool descriptions, schemas, prompts, or `.agent.md` instructions until it passes
4. **AI analysis tells you what else to optimize** — cost, redundant calls, unused tools

pytest-skill-engineering ships two test harnesses:

| | `Eval` + `eval_run` | `CopilotEval` + `copilot_eval` |
|---|---|---|
| **Runs the LLM** | [Pydantic AI](https://ai.pydantic.dev/) synthetic loop | Real GitHub Copilot (CLI SDK) |
| **Model** | Any provider (Azure, OpenAI, Copilot) | Copilot's active model only |
| **MCP auth** | You supply tokens / env vars | Copilot handles OAuth automatically |
| **Introspection** | Full per-call (tool name, args, timing) | Summary (tool names, final response) |
| **Cost tracking** | USD per test (via litellm pricing) | Premium requests (Copilot billing) |
| **Setup** | API keys + model config | `gh auth login` (Copilot subscription) |

### Eval + `eval_run` — bring your own model

You configure the model, wire up MCP servers directly, and get full per-call introspection. Best for iterating on tool descriptions, A/B testing model variants, and cheap CI runs:

```python
from pytest_skill_engineering import Eval, Provider, MCPServer


async def test_balance_query(eval_run):
    agent = Eval(
        provider=Provider(model="azure/gpt-5-mini"),
        mcp_servers=[MCPServer(command=["python", "-m", "my_banking_server"])],
    )
    result = await eval_run(agent, "What's my checking balance?")
    assert result.success
    assert result.tool_was_called("get_balance")
```
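
Because an `Eval` is an ordinary value constructed inside the test, comparing variants is plain pytest parametrization. A minimal sketch of the A/B workflow described earlier; the second model name is a placeholder, and everything else reuses the API from the snippet above:

```python
import pytest

from pytest_skill_engineering import Eval, MCPServer, Provider


# Hypothetical A/B comparison: the same prompt runs against two candidate
# models, and the report/leaderboard ranks the variants by pass rate and cost.
@pytest.mark.parametrize("model", ["azure/gpt-5-mini", "azure/gpt-5.2-chat"])
async def test_balance_query_ab(eval_run, model):
    agent = Eval(
        provider=Provider(model=model),  # the variant under comparison
        mcp_servers=[MCPServer(command=["python", "-m", "my_banking_server"])],
    )
    result = await eval_run(agent, "What's my checking balance?")
    assert result.success
    assert result.tool_was_called("get_balance")
```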

### CopilotEval + `copilot_eval` — use the real coding agent

Runs the actual **GitHub Copilot coding agent** — the same one your users have. No model setup, no API keys. Best for end-to-end testing: OAuth handled automatically, skills and custom agents loaded natively:

```python
from pytest_skill_engineering.copilot import CopilotEval


async def test_skill(copilot_eval):
    agent = CopilotEval(skill_directories=["skills/my-skill"])
    result = await copilot_eval(agent, "What can you help me with?")
    assert result.success
```
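
The same pattern extends to `.agent.md` instruction files (see the Features list below). A hedged sketch only: `Eval.from_agent_file()` is named in this README, but its exact signature isn't shown here, so the path-plus-`provider` call shape below is an assumption:

```python
from pytest_skill_engineering import Eval, Provider


# Assumed call shape: load a .agent.md instruction file and exercise its
# instructions against any model. The `provider=` keyword and the file
# path are hypothetical, used for illustration only.
async def test_reviewer_agent_instructions(eval_run):
    agent = Eval.from_agent_file(
        "agents/reviewer.agent.md",  # hypothetical path
        provider=Provider(model="azure/gpt-5-mini"),
    )
    result = await eval_run(agent, "Review this diff for bugs.")
    assert result.success
```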

→ [Choosing a Test Harness](https://sbroenne.github.io/pytest-skill-engineering/explanation/choosing-a-harness/) — full trade-off guide

## AI Analysis

AI analyzes your results and tells you **what to fix**: which model to deploy, how to improve tool descriptions, where to cut costs. [See a sample report →](https://sbroenne.github.io/pytest-skill-engineering/demo/hero-report.html)

*(Screenshot of the AI analysis report; see the sample report link above.)*

## Quick Start

**Using GitHub Copilot? Zero setup:**

```bash
uv add pytest-skill-engineering[copilot]
gh auth login  # one-time
pytest tests/
```

**Using your own model (Azure, OpenAI, Anthropic…):**

```bash
uv add pytest-skill-engineering
export AZURE_API_BASE=https://your-resource.openai.azure.com/
az login
pytest tests/
```

### AI Analysis judge model (optional but recommended)

The AI analysis report needs a model to generate insights. Configure it in `pyproject.toml`:

**GitHub Copilot:**

```toml
[tool.pytest.ini_options]
addopts = "--aitest-summary-model=copilot/gpt-5-mini"
```

**Azure OpenAI:**

```toml
[tool.pytest.ini_options]
addopts = "--aitest-summary-model=azure/gpt-5.2-chat"
```

## Features

- **MCP Server Testing** — Real models against real tool interfaces and bundled prompt templates
- **Prompt File Testing** — Test VS Code `.prompt.md` and Claude Code command files (slash commands) with `load_prompt_file()` / `load_prompt_files()`
- **CLI Server Testing** — Wrap CLIs as testable tool servers
- **Real Coding Agent Testing** — `CopilotEval` + `copilot_eval` runs the actual Copilot coding agent (native OAuth, skill loading, custom agent dispatch, exact user experience)
- **`.agent.md` Testing** — Load `.agent.md` files with `Eval.from_agent_file()` to test instructions with any model, or use `load_custom_agent()` + `CopilotEval` to test real custom agent dispatch
- **Eval Comparison** — Compare models, skills, `.agent.md` versions, and server configurations
- **Eval Leaderboard** — Auto-ranked by pass rate and cost
- **Multi-Turn Sessions** — Test conversations that build on context
- **AI Analysis** — Actionable feedback on tool descriptions, prompts, and costs
- **Multi-Provider** — Any model via [Pydantic AI](https://ai.pydantic.dev/) (OpenAI, Anthropic, Gemini, Azure, Bedrock, Mistral, and more)
- **Copilot SDK Provider** — Use `copilot/gpt-5-mini` for all LLM calls (judge, insights, scoring) — zero additional setup with `pytest-skill-engineering[copilot]`
- **Clarification Detection** — Catch evals that ask questions instead of acting
- **Semantic Assertions** — Built-in `llm_assert` fixture powered by the [pydantic-evals](https://ai.pydantic.dev/evals/) LLM judge (see the sketch after this list)
- **Multi-Dimension Scoring** — `llm_score` fixture for granular quality measurement across named dimensions
- **Image Assertions** — `llm_assert_image` for AI-graded visual evaluation of screenshots and charts
- **Cost Estimation** — Automatic per-test cost tracking with pricing from litellm + custom overrides
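
As referenced in the Semantic Assertions bullet, here is a minimal sketch of `llm_assert` usage. The fixture name comes from this README; the exact call signature and the `result.output` attribute are assumptions for illustration:

```python
from pytest_skill_engineering import Eval, MCPServer, Provider


# Hypothetical semantic assertion: an LLM judge grades the final response
# against a natural-language criterion instead of exact string matching.
async def test_balance_answer_quality(eval_run, llm_assert):
    agent = Eval(
        provider=Provider(model="azure/gpt-5-mini"),
        mcp_servers=[MCPServer(command=["python", "-m", "my_banking_server"])],
    )
    result = await eval_run(agent, "What's my checking balance?")
    # `result.output` (the agent's final text) is an assumed attribute name.
    await llm_assert(result.output, "states a specific balance for the checking account")
```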

## Who This Is For

- **MCP server authors** — Validate that LLMs can actually use your tools
- **Copilot skill authors** — Test skills and `.agent.md` instructions exactly as users experience them
- **Eval builders** — Compare models, prompts, and skills to find the best configuration
- **Teams shipping AI systems** — Catch LLM-facing regressions in CI/CD

## Documentation

📚 **[Full Documentation](https://sbroenne.github.io/pytest-skill-engineering/)**

## Requirements

- Python 3.11+
- pytest 9.0+
- An LLM provider (Azure, OpenAI, Anthropic, etc.) **or** a GitHub Copilot subscription (`pytest-skill-engineering[copilot]`)

## Acknowledgments

Inspired by [agent-benchmark](https://github.com/mykhaliev/agent-benchmark).

## License

MIT

**pytest_skill_engineering-0.0.1/pyproject.toml** (new file, 152 lines)

```toml
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "pytest-skill-engineering"
version = "0.0.1"
description = "The testing framework for skill engineering. Test tool descriptions, prompt templates, agent skills, and custom agents with real LLMs. AI analyzes results and tells you what to fix."
readme = "README.md"
license = { text = "MIT" }
requires-python = ">=3.11"
authors = [
    { name = "Stefan Brunner" },
]
keywords = ["pytest", "ai", "llm", "mcp", "testing", "agents", "skill-engineering", "skills", "custom-agents"]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Framework :: Pytest",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Software Development :: Testing",
]
dependencies = [
    "pytest>=9.0",
    "mcp>=1.26",
    "pydantic>=2.0",
    "markdown>=3.10.2",
    "htpy>=25.12.0",
    "mdutils>=1.8.1",
    "azure-identity>=1.25.2",
    "pydantic-ai>=1.61.0",
    "pydantic-evals>=1.61.0",
    "litellm>=1.81.13",
    "python-frontmatter>=1.1.0",
]

[project.optional-dependencies]
copilot = [
    "github-copilot-sdk>=0.1.25",
]
dev = [
    "pytest-cov>=7.0",
    "pytest-asyncio>=1.3",
    "python-dotenv>=1.2",
    "ruff>=0.15",
    "pyright>=1.1.408",
    "pre-commit>=4.5",
    "typeguard>=4.5",
]
test = [
    "syrupy>=5.1",
]
docs = [
    "mkdocs>=1.6",
    "mkdocs-material>=9.6,<9.8",
    "mkdocstrings[python]>=0.24",
    "pillow>=11.0",
    "cairosvg>=2.7",
]

[project.scripts]
pytest-skill-engineering-report = "pytest_skill_engineering.cli:main"

[project.entry-points.pytest11]
aitest = "pytest_skill_engineering.plugin"

[project.urls]
Homepage = "https://github.com/sbroenne/pytest-skill-engineering"
Repository = "https://github.com/sbroenne/pytest-skill-engineering"

[tool.hatch.build.targets.wheel]
packages = ["src/pytest_skill_engineering"]

[tool.hatch.build.targets.wheel.sources]
"src" = ""

[tool.hatch.build]
include = [
    "src/pytest_skill_engineering/**/*.py",
    "src/pytest_skill_engineering/**/*.md",
    "src/pytest_skill_engineering/**/*.html",
    "src/pytest_skill_engineering/**/*.css",
    "src/pytest_skill_engineering/**/*.js",
]

[tool.ruff]
line-length = 100
target-version = "py311"

[tool.ruff.lint]
select = ["E", "F", "B", "I"]

[tool.ruff.lint.per-file-ignores]
# Test harnesses have long tool descriptions for LLM readability
"src/pytest_skill_engineering/testing/*.py" = ["E501"]
# Test files have long descriptions and variable declarations
"tests/**/*.py" = ["E402", "E501", "F841"]

[tool.ruff.format]
quote-style = "double"
indent-style = "space"

[tool.pyright]
include = ["src"]
exclude = ["src/pytest_skill_engineering/copilot"]
pythonVersion = "3.11"
typeCheckingMode = "basic"

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
asyncio_mode = "auto"
# Suppress third-party deprecation warnings we can't fix
filterwarnings = [
    "ignore:enable_cleanup_closed:DeprecationWarning:aiohttp",
]
# pytest-skill-engineering configuration for this project's own tests.
# This demonstrates the recommended setup - configure once in pyproject.toml.
# LLM auth is handled by Pydantic AI via env vars (AZURE_API_BASE, OPENAI_API_KEY, etc.)
addopts = """
--aitest-summary-model=azure/gpt-5.2-chat
--aitest-html=aitest-reports/report.html
"""
markers = [
    "integration: marks tests as integration tests (require LLM credentials)",
    "aitest: AI agent tests",
    "basic: basic usage tests (banking and todo)",
    "model: tests with multiple models",
    "sysprompt: tests with multiple system prompts",
    "matrix: model × prompt cross-product tests",
    "skill: tests with skills",
    "session_test: multi-turn session tests",
    "clarification: clarification detection tests",
    "scoring: LLM scoring / rubric tests",
    "iterations: iteration reliability tests",
    "abtest: A/B server comparison tests",
    "cli: CLI server tests",
    "copilot: marks tests as requiring GitHub Copilot SDK credentials",
]

# UV/PEP 735 dependency groups (for uv sync)
[dependency-groups]
dev = [
    "pytest-skill-engineering[dev,test,docs]",
]
```