skillprobe 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skillprobe-0.1.0/.github/FUNDING.yml +4 -0
- skillprobe-0.1.0/.github/workflows/publish.yml +22 -0
- skillprobe-0.1.0/.github/workflows/test.yml +21 -0
- skillprobe-0.1.0/.gitignore +27 -0
- skillprobe-0.1.0/.python-version +1 -0
- skillprobe-0.1.0/LICENSE +21 -0
- skillprobe-0.1.0/PKG-INFO +197 -0
- skillprobe-0.1.0/README.md +175 -0
- skillprobe-0.1.0/examples/skills/clean-python.md +69 -0
- skillprobe-0.1.0/examples/skills/simple-implementations.md +66 -0
- skillprobe-0.1.0/examples/skills/systematic-debugging.md +46 -0
- skillprobe-0.1.0/examples/tests/test-activation.yaml +20 -0
- skillprobe-0.1.0/examples/tests/test-clean-python.yaml +58 -0
- skillprobe-0.1.0/examples/tests/test-response-quality.yaml +28 -0
- skillprobe-0.1.0/examples/tests/test-simple-implementations.yaml +53 -0
- skillprobe-0.1.0/examples/tests/test-systematic-debugging.yaml +41 -0
- skillprobe-0.1.0/pyproject.toml +47 -0
- skillprobe-0.1.0/src/skillprobe/__init__.py +1 -0
- skillprobe-0.1.0/src/skillprobe/adapters/__init__.py +11 -0
- skillprobe-0.1.0/src/skillprobe/adapters/base.py +25 -0
- skillprobe-0.1.0/src/skillprobe/adapters/claude_code.py +111 -0
- skillprobe-0.1.0/src/skillprobe/adapters/cursor.py +141 -0
- skillprobe-0.1.0/src/skillprobe/assertions.py +120 -0
- skillprobe-0.1.0/src/skillprobe/cli.py +140 -0
- skillprobe-0.1.0/src/skillprobe/evidence.py +22 -0
- skillprobe-0.1.0/src/skillprobe/init_generator.py +171 -0
- skillprobe-0.1.0/src/skillprobe/loader.py +66 -0
- skillprobe-0.1.0/src/skillprobe/orchestrator.py +135 -0
- skillprobe-0.1.0/src/skillprobe/py.typed +0 -0
- skillprobe-0.1.0/src/skillprobe/reporter.py +81 -0
- skillprobe-0.1.0/src/skillprobe/workspace.py +66 -0
- skillprobe-0.1.0/tests/__init__.py +0 -0
- skillprobe-0.1.0/tests/conftest.py +1 -0
- skillprobe-0.1.0/tests/test_adapters.py +193 -0
- skillprobe-0.1.0/tests/test_assertions.py +169 -0
- skillprobe-0.1.0/tests/test_evidence.py +38 -0
- skillprobe-0.1.0/tests/test_integration.py +129 -0
- skillprobe-0.1.0/tests/test_loader.py +130 -0
- skillprobe-0.1.0/tests/test_orchestrator.py +291 -0
- skillprobe-0.1.0/tests/test_reporter.py +107 -0
- skillprobe-0.1.0/tests/test_workspace.py +96 -0
- skillprobe-0.1.0/uv.lock +241 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- uses: astral-sh/setup-uv@v4
|
|
17
|
+
with:
|
|
18
|
+
version: "latest"
|
|
19
|
+
|
|
20
|
+
- run: uv build
|
|
21
|
+
|
|
22
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: test
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- uses: astral-sh/setup-uv@v4
|
|
16
|
+
with:
|
|
17
|
+
version: "latest"
|
|
18
|
+
|
|
19
|
+
- run: uv sync --dev
|
|
20
|
+
|
|
21
|
+
- run: uv run pytest tests/ -v
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
|
|
12
|
+
# Runtime data
|
|
13
|
+
skillprobe.db
|
|
14
|
+
captured/
|
|
15
|
+
*.md.bak
|
|
16
|
+
|
|
17
|
+
# PoC files (kept for reference but not part of package)
|
|
18
|
+
proof_of_concept.py
|
|
19
|
+
test_intercept.py
|
|
20
|
+
main.py
|
|
21
|
+
|
|
22
|
+
# Plans and docs not part of the project
|
|
23
|
+
docs/superpowers/
|
|
24
|
+
|
|
25
|
+
# Harness workspace temp directories
|
|
26
|
+
.skillprobe-workspaces/
|
|
27
|
+
.skillprobe-harness-*.db
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
skillprobe-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anish Shrestha
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: skillprobe
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automated end-to-end skill testing for LLM coding tools
|
|
5
|
+
Project-URL: Homepage, https://github.com/Anyesh/skillprobe
|
|
6
|
+
Project-URL: Repository, https://github.com/Anyesh/skillprobe
|
|
7
|
+
Project-URL: Issues, https://github.com/Anyesh/skillprobe/issues
|
|
8
|
+
Author: Anish Shrestha
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Software Development :: Testing
|
|
17
|
+
Requires-Python: >=3.12
|
|
18
|
+
Requires-Dist: click>=8.1
|
|
19
|
+
Requires-Dist: httpx>=0.27
|
|
20
|
+
Requires-Dist: pyyaml>=6.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# skillprobe
|
|
24
|
+
|
|
25
|
+
AI coding tools like Claude Code and Cursor inject instructions into the LLM context behind the scenes, whether they call them skills, rules, or system prompts. There's no good way to test whether those instructions are actually being followed. You write a skill that says "never add docstrings" and half the time the model adds them anyway.
|
|
26
|
+
|
|
27
|
+
skillprobe automates the testing. It launches Claude Code or Cursor as subprocesses, runs your test scenarios in real workspaces, checks the output against assertions, and reports what passed and what didn't, all from a single command with no manual prompting required.
|
|
28
|
+
|
|
29
|
+
## Who this is for (and who it isn't)
|
|
30
|
+
|
|
31
|
+
If you write a few skills for your own use and tweak them when something feels off, you probably don't need this. Most people create skills by asking an LLM to write one, try it a couple times, and if the output looks wrong they ask the LLM to adjust it. That loop is fast, cheap, and good enough for personal use.
|
|
32
|
+
|
|
33
|
+
Where that loop breaks down:
|
|
34
|
+
|
|
35
|
+
**Model updates break skills silently.** Anthropic ships a new Sonnet, Cursor updates their agent behavior, and a skill that worked last week now produces subtly different output. Nobody notices because nobody retested, and skillprobe exists to catch exactly that kind of silent regression.
|
|
36
|
+
|
|
37
|
+
**Teams sharing skills across engineers.** When 20 developers share a "code review" skill, one person's gut check isn't representative because everyone is hitting it with different prompts, different codebases, and different expectations. You need actual coverage across scenarios to know whether the skill holds up.
|
|
38
|
+
|
|
39
|
+
**Publishing to marketplaces.** Both Claude Code and Cursor now have plugin marketplaces where skill authors ship to thousands of users. At that point you're distributing software, not vibing with your own tool. User reports from strangers don't come with context, and "ask the LLM to fix it" doesn't scale to reproducing someone else's problem.
|
|
40
|
+
|
|
41
|
+
**Breaking the endless tweak loop.** You named a skill "clean-python" and told it to never add docstrings, but after three rounds of edits you're not sure if the latest version is actually better or if you just moved the problem around. skillprobe gives you a definitive "this version is better than the last one" signal by running the same scenarios against both and comparing pass rates.
|
|
42
|
+
|
|
43
|
+
If none of those situations apply to you, a simpler workflow (write skill, try it, adjust) is probably the right call. skillprobe is for when you need more confidence than vibing can provide.
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install skillprobe
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Or with uv:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
uv tool install skillprobe
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Or from source:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/Anyesh/skillprobe.git
|
|
61
|
+
cd skillprobe
|
|
62
|
+
uv sync
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Quick start
|
|
66
|
+
|
|
67
|
+
Generate test scenarios from an existing skill, then run them:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
skillprobe init ./skills/my-skill --harness claude-code
|
|
71
|
+
skillprobe run tests/my-skill.yaml
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
Running: tests/my-skill.yaml
|
|
76
|
+
Harness: claude-code
|
|
77
|
+
Model: claude-haiku-4-5-20251001
|
|
78
|
+
Scenarios: 3
|
|
79
|
+
Parallel: 1
|
|
80
|
+
|
|
81
|
+
[PASS] commit skill activates on request (9.1s)
|
|
82
|
+
[PASS] multi-turn refinement (12.3s)
|
|
83
|
+
[FAIL] negative activation -- 'commit' found in response
|
|
84
|
+
step 1: "explain what this project does"
|
|
85
|
+
'commit' found in response
|
|
86
|
+
|
|
87
|
+
2/3 passed (27.8s)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Writing scenarios
|
|
91
|
+
|
|
92
|
+
Scenarios are YAML files describing what to test. Each scenario can have multiple conversational steps, a workspace fixture that gets copied fresh for every run, setup commands, and post-run assertions that check workspace state after everything finishes:
|
|
93
|
+
|
|
94
|
+
```yaml
|
|
95
|
+
harness: claude-code
|
|
96
|
+
model: claude-haiku-4-5-20251001
|
|
97
|
+
timeout: 120
|
|
98
|
+
skill: ./skills/commit
|
|
99
|
+
|
|
100
|
+
scenarios:
|
|
101
|
+
- name: "commit skill activates on request"
|
|
102
|
+
workspace: fixtures/dirty-repo
|
|
103
|
+
setup:
|
|
104
|
+
- run: "echo 'change' >> file.txt && git add ."
|
|
105
|
+
steps:
|
|
106
|
+
- prompt: "commit my changes"
|
|
107
|
+
assert:
|
|
108
|
+
- type: contains
|
|
109
|
+
value: "commit"
|
|
110
|
+
- type: tool_called
|
|
111
|
+
value: "Bash"
|
|
112
|
+
after:
|
|
113
|
+
- type: file_exists
|
|
114
|
+
value: ".git/COMMIT_EDITMSG"
|
|
115
|
+
|
|
116
|
+
- name: "does not activate for unrelated request"
|
|
117
|
+
steps:
|
|
118
|
+
- prompt: "explain what this project does"
|
|
119
|
+
assert:
|
|
120
|
+
- type: not_contains
|
|
121
|
+
value: "commit"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Supported assertion types: `contains`, `not_contains`, `regex`, `tool_called`, `file_exists`, and `file_contains`. Any assertion can be inverted with `negate: true`.
|
|
125
|
+
|
|
126
|
+
## Generating tests
|
|
127
|
+
|
|
128
|
+
You don't have to write scenario YAML from scratch. Point `init` at a skill directory and it reads the SKILL.md, uses an LLM to figure out what should be tested (positive activation, negative activation, behavioral correctness, edge cases), and writes a starter YAML file you can review and tweak:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
skillprobe init ./skills/commit --harness claude-code
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
The `init` command supports both Anthropic and OpenAI as providers for test generation. Pass `--provider openai` and `--model gpt-4o` if you prefer, or it defaults to Anthropic with `claude-sonnet-4-6`. This requires an API key for whichever provider you choose (via `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`).
|
|
135
|
+
|
|
136
|
+
## Commands
|
|
137
|
+
|
|
138
|
+
**`skillprobe run <test.yaml>`** runs test scenarios against a real coding tool.
|
|
139
|
+
|
|
140
|
+
| Flag | Default | Description |
|
|
141
|
+
|---|---|---|
|
|
142
|
+
| `--harness` | from YAML | `claude-code` or `cursor` |
|
|
143
|
+
| `--model` | from YAML | Model to use for the tool under test |
|
|
144
|
+
| `--parallel` | 1 | Number of scenarios to run concurrently |
|
|
145
|
+
| `--timeout` | from YAML | Per-scenario timeout in seconds |
|
|
146
|
+
| `--max-cost` | none | Max USD spend (Claude Code only) |
|
|
147
|
+
|
|
148
|
+
**`skillprobe init <skill-dir>`** generates starter test YAML from a skill definition.
|
|
149
|
+
|
|
150
|
+
| Flag | Default | Description |
|
|
151
|
+
|---|---|---|
|
|
152
|
+
| `--harness` | `claude-code` | Target harness |
|
|
153
|
+
| `--output` | `tests/<skill>.yaml` | Output YAML path |
|
|
154
|
+
| `--provider` | `anthropic` | LLM provider for generation |
|
|
155
|
+
| `--model` | auto | Model for generation |
|
|
156
|
+
| `--fixtures-dir` | `fixtures` | Where to write fixture directories |
|
|
157
|
+
|
|
158
|
+
## Using in CI
|
|
159
|
+
|
|
160
|
+
skillprobe works well in CI for catching regressions when models update or skills change. The CI environment needs the target tool's CLI installed and authenticated, since skillprobe spawns it as a subprocess.
|
|
161
|
+
|
|
162
|
+
```yaml
|
|
163
|
+
# .github/workflows/skill-tests.yml
|
|
164
|
+
name: skill-tests
|
|
165
|
+
|
|
166
|
+
on:
|
|
167
|
+
push:
|
|
168
|
+
paths: ["skills/**", "tests/**"]
|
|
169
|
+
schedule:
|
|
170
|
+
- cron: "0 6 * * 1" # weekly Monday 6am
|
|
171
|
+
|
|
172
|
+
jobs:
|
|
173
|
+
test:
|
|
174
|
+
runs-on: ubuntu-latest
|
|
175
|
+
steps:
|
|
176
|
+
- uses: actions/checkout@v4
|
|
177
|
+
|
|
178
|
+
- run: npm install -g @anthropic-ai/claude-code
|
|
179
|
+
|
|
180
|
+
- uses: astral-sh/setup-uv@v4
|
|
181
|
+
|
|
182
|
+
- run: uv tool install skillprobe
|
|
183
|
+
|
|
184
|
+
- run: skillprobe run tests/my-skill.yaml --harness claude-code
|
|
185
|
+
env:
|
|
186
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Why not promptfoo
|
|
190
|
+
|
|
191
|
+
Tools like promptfoo test prompts in isolation by making their own API calls, outside the tool that will actually use them. skillprobe runs the real tools as subprocesses in real workspaces, so it tests the full stack: skill loading, tool use, file system interactions, multi-turn conversations. It also works with subscriptions (no API key required for the tool under test, only for `init` if you use it).
|
|
192
|
+
|
|
193
|
+
## References
|
|
194
|
+
|
|
195
|
+
- https://github.com/karpathy/autoresearch
|
|
196
|
+
- https://www.news.aakashg.com/p/autoresearch-guide-for-pms
|
|
197
|
+
- https://fortune.com/2026/03/17/andrej-karpathy-loop-autonomous-ai-agents-future/
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# skillprobe
|
|
2
|
+
|
|
3
|
+
AI coding tools like Claude Code and Cursor inject instructions into the LLM context behind the scenes, whether they call them skills, rules, or system prompts. There's no good way to test whether those instructions are actually being followed. You write a skill that says "never add docstrings" and half the time the model adds them anyway.
|
|
4
|
+
|
|
5
|
+
skillprobe automates the testing. It launches Claude Code or Cursor as subprocesses, runs your test scenarios in real workspaces, checks the output against assertions, and reports what passed and what didn't, all from a single command with no manual prompting required.
|
|
6
|
+
|
|
7
|
+
## Who this is for (and who it isn't)
|
|
8
|
+
|
|
9
|
+
If you write a few skills for your own use and tweak them when something feels off, you probably don't need this. Most people create skills by asking an LLM to write one, try it a couple times, and if the output looks wrong they ask the LLM to adjust it. That loop is fast, cheap, and good enough for personal use.
|
|
10
|
+
|
|
11
|
+
Where that loop breaks down:
|
|
12
|
+
|
|
13
|
+
**Model updates break skills silently.** Anthropic ships a new Sonnet, Cursor updates their agent behavior, and a skill that worked last week now produces subtly different output. Nobody notices because nobody retested, and skillprobe exists to catch exactly that kind of silent regression.
|
|
14
|
+
|
|
15
|
+
**Teams sharing skills across engineers.** When 20 developers share a "code review" skill, one person's gut check isn't representative because everyone is hitting it with different prompts, different codebases, and different expectations. You need actual coverage across scenarios to know whether the skill holds up.
|
|
16
|
+
|
|
17
|
+
**Publishing to marketplaces.** Both Claude Code and Cursor now have plugin marketplaces where skill authors ship to thousands of users. At that point you're distributing software, not vibing with your own tool. User reports from strangers don't come with context, and "ask the LLM to fix it" doesn't scale to reproducing someone else's problem.
|
|
18
|
+
|
|
19
|
+
**Breaking the endless tweak loop.** You named a skill "clean-python" and told it to never add docstrings, but after three rounds of edits you're not sure if the latest version is actually better or if you just moved the problem around. skillprobe gives you a definitive "this version is better than the last one" signal by running the same scenarios against both and comparing pass rates.
|
|
20
|
+
|
|
21
|
+
If none of those situations apply to you, a simpler workflow (write skill, try it, adjust) is probably the right call. skillprobe is for when you need more confidence than vibing can provide.
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install skillprobe
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Or with uv:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
uv tool install skillprobe
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Or from source:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
git clone https://github.com/Anyesh/skillprobe.git
|
|
39
|
+
cd skillprobe
|
|
40
|
+
uv sync
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Quick start
|
|
44
|
+
|
|
45
|
+
Generate test scenarios from an existing skill, then run them:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
skillprobe init ./skills/my-skill --harness claude-code
|
|
49
|
+
skillprobe run tests/my-skill.yaml
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
Running: tests/my-skill.yaml
|
|
54
|
+
Harness: claude-code
|
|
55
|
+
Model: claude-haiku-4-5-20251001
|
|
56
|
+
Scenarios: 3
|
|
57
|
+
Parallel: 1
|
|
58
|
+
|
|
59
|
+
[PASS] commit skill activates on request (9.1s)
|
|
60
|
+
[PASS] multi-turn refinement (12.3s)
|
|
61
|
+
[FAIL] negative activation -- 'commit' found in response
|
|
62
|
+
step 1: "explain what this project does"
|
|
63
|
+
'commit' found in response
|
|
64
|
+
|
|
65
|
+
2/3 passed (27.8s)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Writing scenarios
|
|
69
|
+
|
|
70
|
+
Scenarios are YAML files describing what to test. Each scenario can have multiple conversational steps, a workspace fixture that gets copied fresh for every run, setup commands, and post-run assertions that check workspace state after everything finishes:
|
|
71
|
+
|
|
72
|
+
```yaml
|
|
73
|
+
harness: claude-code
|
|
74
|
+
model: claude-haiku-4-5-20251001
|
|
75
|
+
timeout: 120
|
|
76
|
+
skill: ./skills/commit
|
|
77
|
+
|
|
78
|
+
scenarios:
|
|
79
|
+
- name: "commit skill activates on request"
|
|
80
|
+
workspace: fixtures/dirty-repo
|
|
81
|
+
setup:
|
|
82
|
+
- run: "echo 'change' >> file.txt && git add ."
|
|
83
|
+
steps:
|
|
84
|
+
- prompt: "commit my changes"
|
|
85
|
+
assert:
|
|
86
|
+
- type: contains
|
|
87
|
+
value: "commit"
|
|
88
|
+
- type: tool_called
|
|
89
|
+
value: "Bash"
|
|
90
|
+
after:
|
|
91
|
+
- type: file_exists
|
|
92
|
+
value: ".git/COMMIT_EDITMSG"
|
|
93
|
+
|
|
94
|
+
- name: "does not activate for unrelated request"
|
|
95
|
+
steps:
|
|
96
|
+
- prompt: "explain what this project does"
|
|
97
|
+
assert:
|
|
98
|
+
- type: not_contains
|
|
99
|
+
value: "commit"
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Supported assertion types: `contains`, `not_contains`, `regex`, `tool_called`, `file_exists`, and `file_contains`. Any assertion can be inverted with `negate: true`.
|
|
103
|
+
|
|
104
|
+
## Generating tests
|
|
105
|
+
|
|
106
|
+
You don't have to write scenario YAML from scratch. Point `init` at a skill directory and it reads the SKILL.md, uses an LLM to figure out what should be tested (positive activation, negative activation, behavioral correctness, edge cases), and writes a starter YAML file you can review and tweak:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
skillprobe init ./skills/commit --harness claude-code
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
The `init` command supports both Anthropic and OpenAI as providers for test generation. Pass `--provider openai` and `--model gpt-4o` if you prefer, or it defaults to Anthropic with `claude-sonnet-4-6`. This requires an API key for whichever provider you choose (via `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`).
|
|
113
|
+
|
|
114
|
+
## Commands
|
|
115
|
+
|
|
116
|
+
**`skillprobe run <test.yaml>`** runs test scenarios against a real coding tool.
|
|
117
|
+
|
|
118
|
+
| Flag | Default | Description |
|
|
119
|
+
|---|---|---|
|
|
120
|
+
| `--harness` | from YAML | `claude-code` or `cursor` |
|
|
121
|
+
| `--model` | from YAML | Model to use for the tool under test |
|
|
122
|
+
| `--parallel` | 1 | Number of scenarios to run concurrently |
|
|
123
|
+
| `--timeout` | from YAML | Per-scenario timeout in seconds |
|
|
124
|
+
| `--max-cost` | none | Max USD spend (Claude Code only) |
|
|
125
|
+
|
|
126
|
+
**`skillprobe init <skill-dir>`** generates starter test YAML from a skill definition.
|
|
127
|
+
|
|
128
|
+
| Flag | Default | Description |
|
|
129
|
+
|---|---|---|
|
|
130
|
+
| `--harness` | `claude-code` | Target harness |
|
|
131
|
+
| `--output` | `tests/<skill>.yaml` | Output YAML path |
|
|
132
|
+
| `--provider` | `anthropic` | LLM provider for generation |
|
|
133
|
+
| `--model` | auto | Model for generation |
|
|
134
|
+
| `--fixtures-dir` | `fixtures` | Where to write fixture directories |
|
|
135
|
+
|
|
136
|
+
## Using in CI
|
|
137
|
+
|
|
138
|
+
skillprobe works well in CI for catching regressions when models update or skills change. The CI environment needs the target tool's CLI installed and authenticated, since skillprobe spawns it as a subprocess.
|
|
139
|
+
|
|
140
|
+
```yaml
|
|
141
|
+
# .github/workflows/skill-tests.yml
|
|
142
|
+
name: skill-tests
|
|
143
|
+
|
|
144
|
+
on:
|
|
145
|
+
push:
|
|
146
|
+
paths: ["skills/**", "tests/**"]
|
|
147
|
+
schedule:
|
|
148
|
+
- cron: "0 6 * * 1" # weekly Monday 6am
|
|
149
|
+
|
|
150
|
+
jobs:
|
|
151
|
+
test:
|
|
152
|
+
runs-on: ubuntu-latest
|
|
153
|
+
steps:
|
|
154
|
+
- uses: actions/checkout@v4
|
|
155
|
+
|
|
156
|
+
- run: npm install -g @anthropic-ai/claude-code
|
|
157
|
+
|
|
158
|
+
- uses: astral-sh/setup-uv@v4
|
|
159
|
+
|
|
160
|
+
- run: uv tool install skillprobe
|
|
161
|
+
|
|
162
|
+
- run: skillprobe run tests/my-skill.yaml --harness claude-code
|
|
163
|
+
env:
|
|
164
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Why not promptfoo
|
|
168
|
+
|
|
169
|
+
Tools like promptfoo test prompts in isolation by making their own API calls, outside the tool that will actually use them. skillprobe runs the real tools as subprocesses in real workspaces, so it tests the full stack: skill loading, tool use, file system interactions, multi-turn conversations. It also works with subscriptions (no API key required for the tool under test, only for `init` if you use it).
|
|
170
|
+
|
|
171
|
+
## References
|
|
172
|
+
|
|
173
|
+
- https://github.com/karpathy/autoresearch
|
|
174
|
+
- https://www.news.aakashg.com/p/autoresearch-guide-for-pms
|
|
175
|
+
- https://fortune.com/2026/03/17/andrej-karpathy-loop-autonomous-ai-agents-future/
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clean-python
|
|
3
|
+
description: Use when writing Python code - enforce minimal function signatures, type hints, and avoid docstrings on simple functions
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Clean Python
|
|
7
|
+
|
|
8
|
+
Write minimal, focused Python functions that prioritize clarity over verbosity.
|
|
9
|
+
|
|
10
|
+
## Core Principles
|
|
11
|
+
|
|
12
|
+
1. **No docstrings on simple functions** - If a function's purpose is clear from its name and type hints, a docstring is noise
|
|
13
|
+
2. **Always use type hints** - Type hints are documentation that the compiler can verify
|
|
14
|
+
3. **Single-line definitions when possible** - Keep simple functions on one line with arrow notation
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- Writing utility functions (fewer than 5 lines)
|
|
19
|
+
- Creating helper functions with clear names
|
|
20
|
+
- Building libraries where type hints aid discoverability
|
|
21
|
+
|
|
22
|
+
## When NOT to Use
|
|
23
|
+
|
|
24
|
+
- Complex business logic that genuinely needs explanation
|
|
25
|
+
- Public APIs with non-obvious behavior
|
|
26
|
+
- Functions with tricky edge cases
|
|
27
|
+
|
|
28
|
+
## Examples
|
|
29
|
+
|
|
30
|
+
### Good: Clean and clear
|
|
31
|
+
```python
|
|
32
|
+
def is_prime(n: int) -> bool:
|
|
33
|
+
return n > 1 and all(n % i != 0 for i in range(2, int(n**0.5) + 1))
|
|
34
|
+
|
|
35
|
+
def parse_config(path: str) -> dict[str, Any]:
|
|
36
|
+
with open(path) as f:
|
|
37
|
+
return json.load(f)
|
|
38
|
+
|
|
39
|
+
def clamp(value: float, minimum: float, maximum: float) -> float:
|
|
40
|
+
return max(minimum, min(maximum, value))
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Bad: Over-documented
|
|
44
|
+
```python
|
|
45
|
+
def is_prime(n):
|
|
46
|
+
"""Check if n is a prime number."""
|
|
47
|
+
return n > 1 and all(n % i != 0 for i in range(2, int(n**0.5) + 1))
|
|
48
|
+
|
|
49
|
+
def parse_config(path):
|
|
50
|
+
"""
|
|
51
|
+
Loads configuration from a JSON file at the given path.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
path: The file path to load
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
A dictionary containing the config
|
|
58
|
+
"""
|
|
59
|
+
with open(path) as f:
|
|
60
|
+
return json.load(f)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Guidelines
|
|
64
|
+
|
|
65
|
+
**Keep it simple:** If the function name + type hints tell the story, stop writing.
|
|
66
|
+
|
|
67
|
+
**Type hints are mandatory:** Every parameter and return value needs a type annotation.
|
|
68
|
+
|
|
69
|
+
**Multi-line functions may need explanation:** If it's complex enough to need 5+ lines, consider whether a docstring helps or if you should refactor instead.
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: simple-implementations
|
|
3
|
+
description: Use when writing solutions for straightforward problems - prefer direct implementations over premature abstractions
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Simple Implementations
|
|
7
|
+
|
|
8
|
+
Ship working code first. Abstractions emerge from patterns, not predictions.
|
|
9
|
+
|
|
10
|
+
## Core Principle
|
|
11
|
+
|
|
12
|
+
**Three lines of duplication is better than one early abstraction.**
|
|
13
|
+
|
|
14
|
+
Don't write helper functions, factories, or base classes until you've solved the same problem three times.
|
|
15
|
+
|
|
16
|
+
## When to Apply
|
|
17
|
+
|
|
18
|
+
- Building utilities or libraries
|
|
19
|
+
- Solving well-defined problems with clear requirements
|
|
20
|
+
- When the problem is smaller than the "solution"
|
|
21
|
+
|
|
22
|
+
## When NOT to Apply
|
|
23
|
+
|
|
24
|
+
- Architectural layers (MVC, layering, domains)
|
|
25
|
+
- Established design patterns for your domain
|
|
26
|
+
- Code that's mandated by your framework
|
|
27
|
+
|
|
28
|
+
## Pattern
|
|
29
|
+
|
|
30
|
+
### Problem: Premature Abstraction
|
|
31
|
+
```python
|
|
32
|
+
# Over-engineered: generic parameter processing before we know what we need
|
|
33
|
+
class ParameterProcessor:
|
|
34
|
+
def process(self, params: dict, schema: dict) -> dict:
|
|
35
|
+
result = {}
|
|
36
|
+
for key, rule in schema.items():
|
|
37
|
+
if key in params:
|
|
38
|
+
result[key] = self._apply_transformations(params[key], rule.get('transforms'))
|
|
39
|
+
return result
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Solution: Direct Implementation
|
|
43
|
+
```python
|
|
44
|
+
# Direct: Do what's needed, nothing more
|
|
45
|
+
def validate_api_request(headers: dict, body: dict) -> tuple[bool, str]:
|
|
46
|
+
if 'authorization' not in headers:
|
|
47
|
+
return False, "Missing auth header"
|
|
48
|
+
if 'email' not in body:
|
|
49
|
+
return False, "Missing email"
|
|
50
|
+
return True, ""
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Trade-offs
|
|
54
|
+
|
|
55
|
+
**Direct code is harder to extend through abstraction** - but duplicating it is cheap (copy-pasting 5 lines takes 10 seconds).
|
|
56
|
+
|
|
57
|
+
**Abstractions are easy to misuse** - but building them takes hours. Write direct code, extract patterns when they emerge.
|
|
58
|
+
|
|
59
|
+
## Common Mistakes
|
|
60
|
+
|
|
61
|
+
- Writing base classes for one implementation
|
|
62
|
+
- Creating helper utilities before you repeat code
|
|
63
|
+
- Building configuration systems for one use case
|
|
64
|
+
- Parameterizing everything "just in case"
|
|
65
|
+
|
|
66
|
+
**Cost:** Abstractions that never get used, code that's harder to understand than the problem it solves.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: systematic-debugging
|
|
3
|
+
description: Use when a test is failing, code behavior is unexpected, or you're hunting a bug - trace execution systematically before guessing
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Systematic Debugging
|
|
7
|
+
|
|
8
|
+
Find the root cause before changing code.
|
|
9
|
+
|
|
10
|
+
## Three Rules
|
|
11
|
+
|
|
12
|
+
1. **Read the error message** - Start here. 99% of bugs are explained by the error.
|
|
13
|
+
2. **Verify your assumptions** - What you think is true might not be. Check the state.
|
|
14
|
+
3. **Trace execution** - Follow the code path from input to failure point.
|
|
15
|
+
|
|
16
|
+
## Pattern: Hypothesis Testing
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
1. What's the observed behavior? (test output, error, unexpected result)
|
|
20
|
+
2. What should happen? (expected behavior)
|
|
21
|
+
3. Where does it diverge? (add logging to narrow it down)
|
|
22
|
+
4. Why does it diverge? (root cause)
|
|
23
|
+
5. Fix the root cause, not the symptom
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## When NOT to Use
|
|
27
|
+
|
|
28
|
+
- Code review (looking for style issues)
|
|
29
|
+
- Performance optimization (not debugging, different methodology)
|
|
30
|
+
- Learning new frameworks (read docs, not debugging)
|
|
31
|
+
|
|
32
|
+
## Common Mistakes
|
|
33
|
+
|
|
34
|
+
- Adding more logging without reading existing logs
|
|
35
|
+
- Changing code without understanding the failure
|
|
36
|
+
- Fixing the symptom instead of the root cause
|
|
37
|
+
- Using print statements instead of an actual debugger
|
|
38
|
+
|
|
39
|
+
**Cost:** Hours wasted, multiple failed fixes, frustration.
|
|
40
|
+
|
|
41
|
+
## Tools
|
|
42
|
+
|
|
43
|
+
- **Debugger** - Breakpoints, step through execution
|
|
44
|
+
- **Logging** - Trace variable state
|
|
45
|
+
- **Assertions** - Verify assumptions
|
|
46
|
+
- **Reproducible test case** - Isolate the problem
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
activations:
|
|
2
|
+
- skill: clean-python
|
|
3
|
+
should_load_when:
|
|
4
|
+
- "write a python function"
|
|
5
|
+
- "refactor this python code"
|
|
6
|
+
- "create a python module"
|
|
7
|
+
should_not_load_when:
|
|
8
|
+
- "hello"
|
|
9
|
+
- "explain how HTTP works"
|
|
10
|
+
- "write a react component"
|
|
11
|
+
|
|
12
|
+
- skill: sqlalchemy
|
|
13
|
+
should_load_when:
|
|
14
|
+
- "write a sqlalchemy model"
|
|
15
|
+
- "create a database migration"
|
|
16
|
+
- "define the ORM schema"
|
|
17
|
+
should_not_load_when:
|
|
18
|
+
- "write a hello world in python"
|
|
19
|
+
- "what is recursion"
|
|
20
|
+
- "parse this JSON file"
|