pytest-skill-engineering 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. pytest_skill_engineering-0.0.1/.gitignore +64 -0
  2. pytest_skill_engineering-0.0.1/LICENSE +21 -0
  3. pytest_skill_engineering-0.0.1/PKG-INFO +223 -0
  4. pytest_skill_engineering-0.0.1/README.md +171 -0
  5. pytest_skill_engineering-0.0.1/pyproject.toml +152 -0
  6. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/__init__.py +165 -0
  7. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/cli.py +333 -0
  8. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/__init__.py +41 -0
  9. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/eval.py +278 -0
  10. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/evals.py +65 -0
  11. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/events.py +433 -0
  12. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/fixtures.py +183 -0
  13. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/model.py +382 -0
  14. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/personas.py +414 -0
  15. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/result.py +227 -0
  16. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/copilot/runner.py +223 -0
  17. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/__init__.py +78 -0
  18. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/errors.py +44 -0
  19. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/eval.py +510 -0
  20. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/evals.py +474 -0
  21. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/prompt.py +167 -0
  22. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/result.py +415 -0
  23. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/serialization.py +246 -0
  24. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/core/skill.py +250 -0
  25. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/__init__.py +13 -0
  26. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/clarification.py +78 -0
  27. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/cli_toolset.py +84 -0
  28. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/cost.py +167 -0
  29. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/engine.py +387 -0
  30. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/optimizer.py +163 -0
  31. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/pydantic_adapter.py +473 -0
  32. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/rate_limiter.py +177 -0
  33. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/servers.py +502 -0
  34. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/execution/skill_tools.py +117 -0
  35. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/__init__.py +30 -0
  36. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/factories.py +44 -0
  37. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/iteration.py +16 -0
  38. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/llm_assert.py +123 -0
  39. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/llm_assert_image.py +126 -0
  40. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/llm_score.py +360 -0
  41. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/fixtures/run.py +176 -0
  42. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/hooks.py +40 -0
  43. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/plugin.py +1073 -0
  44. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/prompts/__init__.py +24 -0
  45. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/prompts/ai_summary.md +378 -0
  46. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/prompts/coding_agent_analysis.md +361 -0
  47. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/__init__.py +30 -0
  48. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/collector.py +191 -0
  49. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/__init__.py +24 -0
  50. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/agent_leaderboard.py +146 -0
  51. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/agent_selector.py +58 -0
  52. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/overlay.py +75 -0
  53. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/report.py +426 -0
  54. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/test_comparison.py +355 -0
  55. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/test_grid.py +332 -0
  56. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/components/types.py +182 -0
  57. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/generator.py +637 -0
  58. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/insights.py +611 -0
  59. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/reporting/markdown.py +465 -0
  60. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/templates/partials/report.css +1000 -0
  61. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/templates/partials/scripts.js +110 -0
  62. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/__init__.py +17 -0
  63. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/banking.py +479 -0
  64. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/banking_mcp.py +148 -0
  65. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/todo.py +380 -0
  66. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/todo_mcp.py +155 -0
  67. pytest_skill_engineering-0.0.1/src/pytest_skill_engineering/testing/types.py +31 -0
@@ -0,0 +1,64 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .venv/
25
+ venv/
26
+ ENV/
27
+ env/
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Testing
37
+ .pytest_cache/
38
+ .coverage
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+
43
+ # Type checking
44
+ .mypy_cache/
45
+
46
+ # Test outputs
47
+ test_results/
48
+ aitest-reports/
49
+ tests/fixtures/reports/html/
50
+
51
+ # Node.js (Tailwind build)
52
+ node_modules/
53
+ src/pytest_skill_engineering/templates/node_modules/
54
+ package-lock.json
55
+
56
+ # Environment
57
+ .env
58
+ .env.local
59
+
60
+ # OS
61
+ .DS_Store
62
+ Thumbs.db
63
+ site/
64
+ .cache/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stefan Brunner
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,223 @@
1
+ Metadata-Version: 2.4
2
+ Name: pytest-skill-engineering
3
+ Version: 0.0.1
4
+ Summary: The testing framework for skill engineering. Test tool descriptions, prompt templates, agent skills, and custom agents with real LLMs. AI analyzes results and tells you what to fix.
5
+ Project-URL: Homepage, https://github.com/sbroenne/pytest-skill-engineering
6
+ Project-URL: Repository, https://github.com/sbroenne/pytest-skill-engineering
7
+ Author: Stefan Brunner
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: agents,ai,custom-agents,llm,mcp,pytest,skill-engineering,skills,testing
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Framework :: Pytest
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Requires-Python: >=3.11
22
+ Requires-Dist: azure-identity>=1.25.2
23
+ Requires-Dist: htpy>=25.12.0
24
+ Requires-Dist: litellm>=1.81.13
25
+ Requires-Dist: markdown>=3.10.2
26
+ Requires-Dist: mcp>=1.26
27
+ Requires-Dist: mdutils>=1.8.1
28
+ Requires-Dist: pydantic-ai>=1.61.0
29
+ Requires-Dist: pydantic-evals>=1.61.0
30
+ Requires-Dist: pydantic>=2.0
31
+ Requires-Dist: pytest>=9.0
32
+ Requires-Dist: python-frontmatter>=1.1.0
33
+ Provides-Extra: copilot
34
+ Requires-Dist: github-copilot-sdk>=0.1.25; extra == 'copilot'
35
+ Provides-Extra: dev
36
+ Requires-Dist: pre-commit>=4.5; extra == 'dev'
37
+ Requires-Dist: pyright>=1.1.408; extra == 'dev'
38
+ Requires-Dist: pytest-asyncio>=1.3; extra == 'dev'
39
+ Requires-Dist: pytest-cov>=7.0; extra == 'dev'
40
+ Requires-Dist: python-dotenv>=1.2; extra == 'dev'
41
+ Requires-Dist: ruff>=0.15; extra == 'dev'
42
+ Requires-Dist: typeguard>=4.5; extra == 'dev'
43
+ Provides-Extra: docs
44
+ Requires-Dist: cairosvg>=2.7; extra == 'docs'
45
+ Requires-Dist: mkdocs-material<9.8,>=9.6; extra == 'docs'
46
+ Requires-Dist: mkdocs>=1.6; extra == 'docs'
47
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == 'docs'
48
+ Requires-Dist: pillow>=11.0; extra == 'docs'
49
+ Provides-Extra: test
50
+ Requires-Dist: syrupy>=5.1; extra == 'test'
51
+ Description-Content-Type: text/markdown
52
+
53
+ # pytest-skill-engineering
54
+
55
+ [![PyPI version](https://img.shields.io/pypi/v/pytest-skill-engineering)](https://pypi.org/project/pytest-skill-engineering/)
56
+ [![Python versions](https://img.shields.io/pypi/pyversions/pytest-skill-engineering)](https://pypi.org/project/pytest-skill-engineering/)
57
+ [![CI](https://github.com/sbroenne/pytest-skill-engineering/actions/workflows/ci.yml/badge.svg)](https://github.com/sbroenne/pytest-skill-engineering/actions/workflows/ci.yml)
58
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
59
+
60
+ **Skill Engineering. Test-driven. AI-analyzed.**
61
+
62
+ A pytest plugin for skill engineering — test your MCP server tools, prompt templates, agent skills, and `.agent.md` instruction files with real LLMs. Red/Green/Refactor for the skill stack. Let AI analysis tell you what to fix.
63
+
64
+ ## Why?
65
+
66
+ Modern AI systems are built on **skill engineering** — the discipline of designing modular, reliable, callable capabilities that an LLM can discover, invoke, and orchestrate to perform real tasks. Skills are what separate "text generator" from "coding agent that actually does things."
67
+
68
+ An MCP server is the runtime for those skills. It doesn't ship alone — it comes bundled with the **full skill engineering stack**: **tools** (callable functions), **prompt templates** (server-side reasoning starters), **agent skills** (domain knowledge and behavioral guidelines), and **`.agent.md` instruction files** (specialist sub-agent definitions in VS Code / Claude Code format). Users layer on their own **prompt files** (slash commands like `/review`) on top.
69
+
70
+ Your unit tests cover the server code. Nothing covers the skill stack. And the skill stack is what the LLM actually sees.
71
+
72
+ **Skill engineering breaks in ways code tests can't catch:**
73
+
74
+ - The tool description is too vague — the LLM picks the wrong tool or passes garbage parameters
75
+ - The prompt template renders correctly but the assembled message confuses the LLM
76
+ - A prompt file's slash command produces garbage output because the instructions are ambiguous
77
+ - The skill has the right facts but is structured so poorly the LLM skips it
78
+ - The `.agent.md` file has the right tools listed but the description is too vague to trigger subagent dispatch
79
+
80
+ **And when you're improving it — how do you know version A is better than version B?**
81
+
82
+ Skill engineering is iterative — prompt tuning, tool description refinement, `.agent.md` instructions, skill structure. You need A/B testing built in. Run both versions, same prompts, and let the leaderboard tell you which one wins on pass rate and cost.
83
+
84
+ That's what pytest-skill-engineering does: test the full skill engineering stack, compare variants, and get AI analysis that tells you exactly what to fix.
85
+
86
+ ## How It Works
87
+
88
+ Write tests as natural language prompts — you assert on what happened. If a test fails, your tool descriptions or skill need work, not your code:
89
+
90
+ 1. **Write a test** — a prompt that describes what a user would say
91
+ 2. **Run it** — an LLM tries to use your tools and fails
92
+ 3. **Fix the skill stack** — improve tool descriptions, schemas, prompts, or `.agent.md` instructions until it passes
93
+ 4. **AI analysis tells you what else to optimize** — cost, redundant calls, unused tools
94
+
95
+ pytest-skill-engineering ships two test harnesses:
96
+
97
+ | | `Eval` + `eval_run` | `CopilotEval` + `copilot_eval` |
98
+ |---|---|---|
99
+ | **Runs the LLM** | [Pydantic AI](https://ai.pydantic.dev/) synthetic loop | Real GitHub Copilot (CLI SDK) |
100
+ | **Model** | Any provider (Azure, OpenAI, Copilot) | Copilot's active model only |
101
+ | **MCP auth** | You supply tokens / env vars | Copilot handles OAuth automatically |
102
+ | **Introspection** | Full per-call (tool name, args, timing) | Summary (tool names, final response) |
103
+ | **Cost tracking** | USD per test (via litellm pricing) | Premium requests (Copilot billing) |
104
+ | **Setup** | API keys + model config | `gh auth login` (Copilot subscription) |
105
+
106
+ ### Eval + `eval_run` — bring your own model
107
+
108
+ You configure the model, wire up MCP servers directly, and get full per-call introspection. Best for iterating on tool descriptions, A/B testing model variants, and cheap CI runs:
109
+
110
+ ```python
111
+ from pytest_skill_engineering import Eval, Provider, MCPServer
112
+
113
+ async def test_balance_query(eval_run):
114
+ agent = Eval(
115
+ provider=Provider(model="azure/gpt-5-mini"),
116
+ mcp_servers=[MCPServer(command=["python", "-m", "my_banking_server"])],
117
+ )
118
+ result = await eval_run(agent, "What's my checking balance?")
119
+ assert result.success
120
+ assert result.tool_was_called("get_balance")
121
+ ```
122
+
123
+ ### CopilotEval + `copilot_eval` — use the real coding agent
124
+
125
+ Runs the actual **GitHub Copilot coding agent** — the same one your users have. No model setup, no API keys. Best for end-to-end testing: OAuth handled automatically, skills and custom agents loaded natively:
126
+
127
+ ```python
128
+ from pytest_skill_engineering.copilot import CopilotEval
129
+
130
+ async def test_skill(copilot_eval):
131
+ agent = CopilotEval(skill_directories=["skills/my-skill"])
132
+ result = await copilot_eval(agent, "What can you help me with?")
133
+ assert result.success
134
+ ```
135
+
136
+ → [Choosing a Test Harness](https://sbroenne.github.io/pytest-skill-engineering/explanation/choosing-a-harness/) — full trade-off guide
137
+
138
+ ## AI Analysis
139
+
140
+ AI analyzes your results and tells you **what to fix**: which model to deploy, how to improve tool descriptions, where to cut costs. [See a sample report →](https://sbroenne.github.io/pytest-skill-engineering/demo/hero-report.html)
141
+
142
+ ![AI Analysis — winner recommendation, metrics, and comparative analysis](screenshots/ai_analysis.png)
143
+
144
+ ## Quick Start
145
+
146
+ **Using GitHub Copilot? Zero setup:**
147
+
148
+ ```bash
149
+ uv add "pytest-skill-engineering[copilot]"
150
+ gh auth login # one-time
151
+ pytest tests/
152
+ ```
153
+
154
+ **Using your own model (Azure, OpenAI, Anthropic…):**
155
+
156
+ ```bash
157
+ uv add pytest-skill-engineering
158
+ export AZURE_API_BASE=https://your-resource.openai.azure.com/
159
+ az login
160
+ pytest tests/
161
+ ```
162
+
163
+ ### AI Analysis judge model (optional but recommended)
164
+
165
+ The AI analysis report needs a model to generate insights. Configure it in `pyproject.toml`:
166
+
167
+ **GitHub Copilot:**
168
+
169
+ ```toml
170
+ [tool.pytest.ini_options]
171
+ addopts = "--aitest-summary-model=copilot/gpt-5-mini"
172
+ ```
173
+
174
+ **Azure OpenAI:**
175
+
176
+ ```toml
177
+ [tool.pytest.ini_options]
178
+ addopts = "--aitest-summary-model=azure/gpt-5.2-chat"
179
+ ```
180
+
181
+ ## Features
182
+
183
+ - **MCP Server Testing** — Real models against real tool interfaces and bundled prompt templates
184
+ - **Prompt File Testing** — Test VS Code `.prompt.md` and Claude Code command files (slash commands) with `load_prompt_file()` / `load_prompt_files()`
185
+ - **CLI Server Testing** — Wrap CLIs as testable tool servers
186
+ - **Real Coding Agent Testing** — `CopilotEval` + `copilot_eval` runs the actual Copilot coding agent (native OAuth, skill loading, custom agent dispatch, exact user experience)
187
+ - **`.agent.md` Testing** — Load `.agent.md` files with `Eval.from_agent_file()` to test instructions with any model, or use `load_custom_agent()` + `CopilotEval` to test real custom agent dispatch
188
+ - **Eval Comparison** — Compare models, skills, `.agent.md` versions, and server configurations
189
+ - **Eval Leaderboard** — Auto-ranked by pass rate and cost
190
+ - **Multi-Turn Sessions** — Test conversations that build on context
191
+ - **AI Analysis** — Actionable feedback on tool descriptions, prompts, and costs
192
+ - **Multi-Provider** — Any model via [Pydantic AI](https://ai.pydantic.dev/) (OpenAI, Anthropic, Gemini, Azure, Bedrock, Mistral, and more)
193
+ - **Copilot SDK Provider** — Use `copilot/gpt-5-mini` for all LLM calls (judge, insights, scoring) — zero additional setup with `pytest-skill-engineering[copilot]`
194
+ - **Clarification Detection** — Catch evals that ask questions instead of acting
195
+ - **Semantic Assertions** — Built-in `llm_assert` fixture powered by [pydantic-evals](https://ai.pydantic.dev/evals/) LLM judge
196
+ - **Multi-Dimension Scoring** — `llm_score` fixture for granular quality measurement across named dimensions
197
+ - **Image Assertions** — `llm_assert_image` for AI-graded visual evaluation of screenshots and charts
198
+ - **Cost Estimation** — Automatic per-test cost tracking with pricing from litellm + custom overrides
199
+
200
+ ## Who This Is For
201
+
202
+ - **MCP server authors** — Validate that LLMs can actually use your tools
203
+ - **Copilot skill authors** — Test skills and `.agent.md` instructions exactly as users experience them
204
+ - **Eval builders** — Compare models, prompts, and skills to find the best configuration
205
+ - **Teams shipping AI systems** — Catch LLM-facing regressions in CI/CD
206
+
207
+ ## Documentation
208
+
209
+ 📚 **[Full Documentation](https://sbroenne.github.io/pytest-skill-engineering/)**
210
+
211
+ ## Requirements
212
+
213
+ - Python 3.11+
214
+ - pytest 9.0+
215
+ - An LLM provider (Azure, OpenAI, Anthropic, etc.) **or** a GitHub Copilot subscription (`pytest-skill-engineering[copilot]`)
216
+
217
+ ## Acknowledgments
218
+
219
+ Inspired by [agent-benchmark](https://github.com/mykhaliev/agent-benchmark).
220
+
221
+ ## License
222
+
223
+ MIT
@@ -0,0 +1,171 @@
1
+ # pytest-skill-engineering
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/pytest-skill-engineering)](https://pypi.org/project/pytest-skill-engineering/)
4
+ [![Python versions](https://img.shields.io/pypi/pyversions/pytest-skill-engineering)](https://pypi.org/project/pytest-skill-engineering/)
5
+ [![CI](https://github.com/sbroenne/pytest-skill-engineering/actions/workflows/ci.yml/badge.svg)](https://github.com/sbroenne/pytest-skill-engineering/actions/workflows/ci.yml)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+
8
+ **Skill Engineering. Test-driven. AI-analyzed.**
9
+
10
+ A pytest plugin for skill engineering — test your MCP server tools, prompt templates, agent skills, and `.agent.md` instruction files with real LLMs. Red/Green/Refactor for the skill stack. Let AI analysis tell you what to fix.
11
+
12
+ ## Why?
13
+
14
+ Modern AI systems are built on **skill engineering** — the discipline of designing modular, reliable, callable capabilities that an LLM can discover, invoke, and orchestrate to perform real tasks. Skills are what separate "text generator" from "coding agent that actually does things."
15
+
16
+ An MCP server is the runtime for those skills. It doesn't ship alone — it comes bundled with the **full skill engineering stack**: **tools** (callable functions), **prompt templates** (server-side reasoning starters), **agent skills** (domain knowledge and behavioral guidelines), and **`.agent.md` instruction files** (specialist sub-agent definitions in VS Code / Claude Code format). Users layer on their own **prompt files** (slash commands like `/review`) on top.
17
+
18
+ Your unit tests cover the server code. Nothing covers the skill stack. And the skill stack is what the LLM actually sees.
19
+
20
+ **Skill engineering breaks in ways code tests can't catch:**
21
+
22
+ - The tool description is too vague — the LLM picks the wrong tool or passes garbage parameters
23
+ - The prompt template renders correctly but the assembled message confuses the LLM
24
+ - A prompt file's slash command produces garbage output because the instructions are ambiguous
25
+ - The skill has the right facts but is structured so poorly the LLM skips it
26
+ - The `.agent.md` file has the right tools listed but the description is too vague to trigger subagent dispatch
27
+
28
+ **And when you're improving it — how do you know version A is better than version B?**
29
+
30
+ Skill engineering is iterative — prompt tuning, tool description refinement, `.agent.md` instructions, skill structure. You need A/B testing built in. Run both versions, same prompts, and let the leaderboard tell you which one wins on pass rate and cost.
31
+
32
+ That's what pytest-skill-engineering does: test the full skill engineering stack, compare variants, and get AI analysis that tells you exactly what to fix.
33
+
34
+ ## How It Works
35
+
36
+ Write tests as natural language prompts — you assert on what happened. If a test fails, your tool descriptions or skill need work, not your code:
37
+
38
+ 1. **Write a test** — a prompt that describes what a user would say
39
+ 2. **Run it** — an LLM tries to use your tools and fails
40
+ 3. **Fix the skill stack** — improve tool descriptions, schemas, prompts, or `.agent.md` instructions until it passes
41
+ 4. **AI analysis tells you what else to optimize** — cost, redundant calls, unused tools
42
+
43
+ pytest-skill-engineering ships two test harnesses:
44
+
45
+ | | `Eval` + `eval_run` | `CopilotEval` + `copilot_eval` |
46
+ |---|---|---|
47
+ | **Runs the LLM** | [Pydantic AI](https://ai.pydantic.dev/) synthetic loop | Real GitHub Copilot (CLI SDK) |
48
+ | **Model** | Any provider (Azure, OpenAI, Copilot) | Copilot's active model only |
49
+ | **MCP auth** | You supply tokens / env vars | Copilot handles OAuth automatically |
50
+ | **Introspection** | Full per-call (tool name, args, timing) | Summary (tool names, final response) |
51
+ | **Cost tracking** | USD per test (via litellm pricing) | Premium requests (Copilot billing) |
52
+ | **Setup** | API keys + model config | `gh auth login` (Copilot subscription) |
53
+
54
+ ### Eval + `eval_run` — bring your own model
55
+
56
+ You configure the model, wire up MCP servers directly, and get full per-call introspection. Best for iterating on tool descriptions, A/B testing model variants, and cheap CI runs:
57
+
58
+ ```python
59
+ from pytest_skill_engineering import Eval, Provider, MCPServer
60
+
61
+ async def test_balance_query(eval_run):
62
+ agent = Eval(
63
+ provider=Provider(model="azure/gpt-5-mini"),
64
+ mcp_servers=[MCPServer(command=["python", "-m", "my_banking_server"])],
65
+ )
66
+ result = await eval_run(agent, "What's my checking balance?")
67
+ assert result.success
68
+ assert result.tool_was_called("get_balance")
69
+ ```
70
+
71
+ ### CopilotEval + `copilot_eval` — use the real coding agent
72
+
73
+ Runs the actual **GitHub Copilot coding agent** — the same one your users have. No model setup, no API keys. Best for end-to-end testing: OAuth handled automatically, skills and custom agents loaded natively:
74
+
75
+ ```python
76
+ from pytest_skill_engineering.copilot import CopilotEval
77
+
78
+ async def test_skill(copilot_eval):
79
+ agent = CopilotEval(skill_directories=["skills/my-skill"])
80
+ result = await copilot_eval(agent, "What can you help me with?")
81
+ assert result.success
82
+ ```
83
+
84
+ → [Choosing a Test Harness](https://sbroenne.github.io/pytest-skill-engineering/explanation/choosing-a-harness/) — full trade-off guide
85
+
86
+ ## AI Analysis
87
+
88
+ AI analyzes your results and tells you **what to fix**: which model to deploy, how to improve tool descriptions, where to cut costs. [See a sample report →](https://sbroenne.github.io/pytest-skill-engineering/demo/hero-report.html)
89
+
90
+ ![AI Analysis — winner recommendation, metrics, and comparative analysis](screenshots/ai_analysis.png)
91
+
92
+ ## Quick Start
93
+
94
+ **Using GitHub Copilot? Zero setup:**
95
+
96
+ ```bash
97
+ uv add "pytest-skill-engineering[copilot]"
98
+ gh auth login # one-time
99
+ pytest tests/
100
+ ```
101
+
102
+ **Using your own model (Azure, OpenAI, Anthropic…):**
103
+
104
+ ```bash
105
+ uv add pytest-skill-engineering
106
+ export AZURE_API_BASE=https://your-resource.openai.azure.com/
107
+ az login
108
+ pytest tests/
109
+ ```
110
+
111
+ ### AI Analysis judge model (optional but recommended)
112
+
113
+ The AI analysis report needs a model to generate insights. Configure it in `pyproject.toml`:
114
+
115
+ **GitHub Copilot:**
116
+
117
+ ```toml
118
+ [tool.pytest.ini_options]
119
+ addopts = "--aitest-summary-model=copilot/gpt-5-mini"
120
+ ```
121
+
122
+ **Azure OpenAI:**
123
+
124
+ ```toml
125
+ [tool.pytest.ini_options]
126
+ addopts = "--aitest-summary-model=azure/gpt-5.2-chat"
127
+ ```
128
+
129
+ ## Features
130
+
131
+ - **MCP Server Testing** — Real models against real tool interfaces and bundled prompt templates
132
+ - **Prompt File Testing** — Test VS Code `.prompt.md` and Claude Code command files (slash commands) with `load_prompt_file()` / `load_prompt_files()`
133
+ - **CLI Server Testing** — Wrap CLIs as testable tool servers
134
+ - **Real Coding Agent Testing** — `CopilotEval` + `copilot_eval` runs the actual Copilot coding agent (native OAuth, skill loading, custom agent dispatch, exact user experience)
135
+ - **`.agent.md` Testing** — Load `.agent.md` files with `Eval.from_agent_file()` to test instructions with any model, or use `load_custom_agent()` + `CopilotEval` to test real custom agent dispatch
136
+ - **Eval Comparison** — Compare models, skills, `.agent.md` versions, and server configurations
137
+ - **Eval Leaderboard** — Auto-ranked by pass rate and cost
138
+ - **Multi-Turn Sessions** — Test conversations that build on context
139
+ - **AI Analysis** — Actionable feedback on tool descriptions, prompts, and costs
140
+ - **Multi-Provider** — Any model via [Pydantic AI](https://ai.pydantic.dev/) (OpenAI, Anthropic, Gemini, Azure, Bedrock, Mistral, and more)
141
+ - **Copilot SDK Provider** — Use `copilot/gpt-5-mini` for all LLM calls (judge, insights, scoring) — zero additional setup with `pytest-skill-engineering[copilot]`
142
+ - **Clarification Detection** — Catch evals that ask questions instead of acting
143
+ - **Semantic Assertions** — Built-in `llm_assert` fixture powered by [pydantic-evals](https://ai.pydantic.dev/evals/) LLM judge
144
+ - **Multi-Dimension Scoring** — `llm_score` fixture for granular quality measurement across named dimensions
145
+ - **Image Assertions** — `llm_assert_image` for AI-graded visual evaluation of screenshots and charts
146
+ - **Cost Estimation** — Automatic per-test cost tracking with pricing from litellm + custom overrides
147
+
148
+ ## Who This Is For
149
+
150
+ - **MCP server authors** — Validate that LLMs can actually use your tools
151
+ - **Copilot skill authors** — Test skills and `.agent.md` instructions exactly as users experience them
152
+ - **Eval builders** — Compare models, prompts, and skills to find the best configuration
153
+ - **Teams shipping AI systems** — Catch LLM-facing regressions in CI/CD
154
+
155
+ ## Documentation
156
+
157
+ 📚 **[Full Documentation](https://sbroenne.github.io/pytest-skill-engineering/)**
158
+
159
+ ## Requirements
160
+
161
+ - Python 3.11+
162
+ - pytest 9.0+
163
+ - An LLM provider (Azure, OpenAI, Anthropic, etc.) **or** a GitHub Copilot subscription (`pytest-skill-engineering[copilot]`)
164
+
165
+ ## Acknowledgments
166
+
167
+ Inspired by [agent-benchmark](https://github.com/mykhaliev/agent-benchmark).
168
+
169
+ ## License
170
+
171
+ MIT
@@ -0,0 +1,152 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "pytest-skill-engineering"
7
+ version = "0.0.1"
8
+ description = "The testing framework for skill engineering. Test tool descriptions, prompt templates, agent skills, and custom agents with real LLMs. AI analyzes results and tells you what to fix."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.11"
12
+ authors = [
13
+ { name = "Stefan Brunner" }
14
+ ]
15
+ keywords = ["pytest", "ai", "llm", "mcp", "testing", "agents", "skill-engineering", "skills", "custom-agents"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Framework :: Pytest",
19
+ "Intended Audience :: Developers",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Operating System :: OS Independent",
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Python :: 3.13",
26
+ "Topic :: Software Development :: Testing",
27
+ ]
28
+ dependencies = [
29
+ "pytest>=9.0",
30
+ "mcp>=1.26",
31
+ "pydantic>=2.0",
32
+ "markdown>=3.10.2",
33
+ "htpy>=25.12.0",
34
+ "mdutils>=1.8.1",
35
+ "azure-identity>=1.25.2",
36
+ "pydantic-ai>=1.61.0",
37
+ "pydantic-evals>=1.61.0",
38
+ "litellm>=1.81.13",
39
+ "python-frontmatter>=1.1.0",
40
+ ]
41
+
42
+ [project.optional-dependencies]
43
+ copilot = [
44
+ "github-copilot-sdk>=0.1.25",
45
+ ]
46
+ dev = [
47
+ "pytest-cov>=7.0",
48
+ "pytest-asyncio>=1.3",
49
+ "python-dotenv>=1.2",
50
+ "ruff>=0.15",
51
+ "pyright>=1.1.408",
52
+ "pre-commit>=4.5",
53
+ "typeguard>=4.5",
54
+ ]
55
+ test = [
56
+ "syrupy>=5.1",
57
+ ]
58
+ docs = [
59
+ "mkdocs>=1.6",
60
+ "mkdocs-material>=9.6,<9.8",
61
+ "mkdocstrings[python]>=0.24",
62
+ "pillow>=11.0",
63
+ "cairosvg>=2.7",
64
+ ]
65
+
66
+ [project.scripts]
67
+ pytest-skill-engineering-report = "pytest_skill_engineering.cli:main"
68
+
69
+ [project.entry-points.pytest11]
70
+ aitest = "pytest_skill_engineering.plugin"
71
+
72
+ [project.urls]
73
+ Homepage = "https://github.com/sbroenne/pytest-skill-engineering"
74
+ Repository = "https://github.com/sbroenne/pytest-skill-engineering"
75
+
76
+ [tool.hatch.build.targets.wheel]
77
+ packages = ["src/pytest_skill_engineering"]
78
+
79
+ [tool.hatch.build.targets.wheel.sources]
80
+ "src" = ""
81
+
82
+ [tool.hatch.build]
83
+ include = [
84
+ "src/pytest_skill_engineering/**/*.py",
85
+ "src/pytest_skill_engineering/**/*.md",
86
+ "src/pytest_skill_engineering/**/*.html",
87
+ "src/pytest_skill_engineering/**/*.css",
88
+ "src/pytest_skill_engineering/**/*.js",
89
+ ]
90
+
91
+ [tool.ruff]
92
+ line-length = 100
93
+ target-version = "py311"
94
+
95
+ [tool.ruff.lint]
96
+ select = ["E", "F", "B", "I"]
97
+
98
+ [tool.ruff.lint.per-file-ignores]
99
+ # Test harnesses have long tool descriptions for LLM readability
100
+ "src/pytest_skill_engineering/testing/*.py" = ["E501"]
101
+ # Test files have long descriptions and variable declarations
102
+ "tests/**/*.py" = ["E402", "E501", "F841"]
103
+
104
+ [tool.ruff.format]
105
+ quote-style = "double"
106
+ indent-style = "space"
107
+
108
+ [tool.pyright]
109
+ include = ["src"]
110
+ exclude = ["src/pytest_skill_engineering/copilot"]
111
+ pythonVersion = "3.11"
112
+ typeCheckingMode = "basic"
113
+
114
+ [tool.pytest.ini_options]
115
+ testpaths = ["tests"]
116
+ python_files = ["test_*.py"]
117
+ python_classes = ["Test*"]
118
+ python_functions = ["test_*"]
119
+ asyncio_mode = "auto"
120
+ # Suppress third-party deprecation warnings we can't fix
121
+ filterwarnings = [
122
+ "ignore:enable_cleanup_closed:DeprecationWarning:aiohttp",
123
+ ]
124
+ # pytest-skill-engineering configuration for this project's own tests.
125
+ # This demonstrates the recommended setup - configure once in pyproject.toml.
126
+ # LLM auth is handled by Pydantic AI via env vars (AZURE_API_BASE, OPENAI_API_KEY, etc.)
127
+ addopts = """
128
+ --aitest-summary-model=azure/gpt-5.2-chat
129
+ --aitest-html=aitest-reports/report.html
130
+ """
131
+ markers = [
132
+ "integration: marks tests as integration tests (require LLM credentials)",
133
+ "aitest: AI agent tests",
134
+ "basic: basic usage tests (banking and todo)",
135
+ "model: tests with multiple models",
136
+ "sysprompt: tests with multiple system prompts",
137
+ "matrix: model × prompt cross-product tests",
138
+ "skill: tests with skills",
139
+ "session_test: multi-turn session tests",
140
+ "clarification: clarification detection tests",
141
+ "scoring: LLM scoring / rubric tests",
142
+ "iterations: iteration reliability tests",
143
+ "abtest: A/B server comparison tests",
144
+ "cli: CLI server tests",
145
+ "copilot: marks tests as requiring GitHub Copilot SDK credentials",
146
+ ]
147
+
148
+ # UV/PEP 735 dependency groups (for uv sync)
149
+ [dependency-groups]
150
+ dev = [
151
+ "pytest-skill-engineering[dev,test,docs]",
152
+ ]