skillprobe 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. skillprobe-0.1.0/.github/FUNDING.yml +4 -0
  2. skillprobe-0.1.0/.github/workflows/publish.yml +22 -0
  3. skillprobe-0.1.0/.github/workflows/test.yml +21 -0
  4. skillprobe-0.1.0/.gitignore +27 -0
  5. skillprobe-0.1.0/.python-version +1 -0
  6. skillprobe-0.1.0/LICENSE +21 -0
  7. skillprobe-0.1.0/PKG-INFO +197 -0
  8. skillprobe-0.1.0/README.md +175 -0
  9. skillprobe-0.1.0/examples/skills/clean-python.md +69 -0
  10. skillprobe-0.1.0/examples/skills/simple-implementations.md +66 -0
  11. skillprobe-0.1.0/examples/skills/systematic-debugging.md +46 -0
  12. skillprobe-0.1.0/examples/tests/test-activation.yaml +20 -0
  13. skillprobe-0.1.0/examples/tests/test-clean-python.yaml +58 -0
  14. skillprobe-0.1.0/examples/tests/test-response-quality.yaml +28 -0
  15. skillprobe-0.1.0/examples/tests/test-simple-implementations.yaml +53 -0
  16. skillprobe-0.1.0/examples/tests/test-systematic-debugging.yaml +41 -0
  17. skillprobe-0.1.0/pyproject.toml +47 -0
  18. skillprobe-0.1.0/src/skillprobe/__init__.py +1 -0
  19. skillprobe-0.1.0/src/skillprobe/adapters/__init__.py +11 -0
  20. skillprobe-0.1.0/src/skillprobe/adapters/base.py +25 -0
  21. skillprobe-0.1.0/src/skillprobe/adapters/claude_code.py +111 -0
  22. skillprobe-0.1.0/src/skillprobe/adapters/cursor.py +141 -0
  23. skillprobe-0.1.0/src/skillprobe/assertions.py +120 -0
  24. skillprobe-0.1.0/src/skillprobe/cli.py +140 -0
  25. skillprobe-0.1.0/src/skillprobe/evidence.py +22 -0
  26. skillprobe-0.1.0/src/skillprobe/init_generator.py +171 -0
  27. skillprobe-0.1.0/src/skillprobe/loader.py +66 -0
  28. skillprobe-0.1.0/src/skillprobe/orchestrator.py +135 -0
  29. skillprobe-0.1.0/src/skillprobe/py.typed +0 -0
  30. skillprobe-0.1.0/src/skillprobe/reporter.py +81 -0
  31. skillprobe-0.1.0/src/skillprobe/workspace.py +66 -0
  32. skillprobe-0.1.0/tests/__init__.py +0 -0
  33. skillprobe-0.1.0/tests/conftest.py +1 -0
  34. skillprobe-0.1.0/tests/test_adapters.py +193 -0
  35. skillprobe-0.1.0/tests/test_assertions.py +169 -0
  36. skillprobe-0.1.0/tests/test_evidence.py +38 -0
  37. skillprobe-0.1.0/tests/test_integration.py +129 -0
  38. skillprobe-0.1.0/tests/test_loader.py +130 -0
  39. skillprobe-0.1.0/tests/test_orchestrator.py +291 -0
  40. skillprobe-0.1.0/tests/test_reporter.py +107 -0
  41. skillprobe-0.1.0/tests/test_workspace.py +96 -0
  42. skillprobe-0.1.0/uv.lock +241 -0
@@ -0,0 +1,4 @@
1
+ # These are supported funding model platforms
2
+
3
+ github: anyesh
4
+ buy_me_a_coffee: anyesh
@@ -0,0 +1,22 @@
1
+ name: publish
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ publish:
10
+ runs-on: ubuntu-latest
11
+ permissions:
12
+ id-token: write
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - uses: astral-sh/setup-uv@v4
17
+ with:
18
+ version: "latest"
19
+
20
+ - run: uv build
21
+
22
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,21 @@
1
+ name: test
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - uses: astral-sh/setup-uv@v4
16
+ with:
17
+ version: "latest"
18
+
19
+ - run: uv sync --dev
20
+
21
+ - run: uv run pytest tests/ -v
@@ -0,0 +1,27 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Runtime data
13
+ skillprobe.db
14
+ captured/
15
+ *.md.bak
16
+
17
+ # PoC files (kept for reference but not part of package)
18
+ proof_of_concept.py
19
+ test_intercept.py
20
+ main.py
21
+
22
+ # Plans and docs not part of the project
23
+ docs/superpowers/
24
+
25
+ # Harness workspace temp directories
26
+ .skillprobe-workspaces/
27
+ .skillprobe-harness-*.db
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anish Shrestha
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.4
2
+ Name: skillprobe
3
+ Version: 0.1.0
4
+ Summary: Automated end-to-end skill testing for LLM coding tools
5
+ Project-URL: Homepage, https://github.com/Anyesh/skillprobe
6
+ Project-URL: Repository, https://github.com/Anyesh/skillprobe
7
+ Project-URL: Issues, https://github.com/Anyesh/skillprobe/issues
8
+ Author: Anish Shrestha
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Software Development :: Testing
17
+ Requires-Python: >=3.12
18
+ Requires-Dist: click>=8.1
19
+ Requires-Dist: httpx>=0.27
20
+ Requires-Dist: pyyaml>=6.0
21
+ Description-Content-Type: text/markdown
22
+
23
+ # skillprobe
24
+
25
+ AI coding tools like Claude Code and Cursor inject instructions into the LLM context behind the scenes, whether they call them skills, rules, or system prompts. There's no good way to test whether those instructions are actually being followed. You write a skill that says "never add docstrings" and half the time the model adds them anyway.
26
+
27
+ skillprobe automates the testing. It launches Claude Code or Cursor as subprocesses, runs your test scenarios in real workspaces, checks the output against assertions, and reports what passed and what didn't, all from a single command with no manual prompting required.
28
+
29
+ ## Who this is for (and who it isn't)
30
+
31
+ If you write a few skills for your own use and tweak them when something feels off, you probably don't need this. Most people create skills by asking an LLM to write one, try it a couple times, and if the output looks wrong they ask the LLM to adjust it. That loop is fast, cheap, and good enough for personal use.
32
+
33
+ Where that loop breaks down:
34
+
35
+ **Model updates break skills silently.** Anthropic ships a new Sonnet, Cursor updates their agent behavior, and a skill that worked last week now produces subtly different output. Nobody notices because nobody retested, and skillprobe exists to catch exactly that kind of silent regression.
36
+
37
+ **Teams sharing skills across engineers.** When 20 developers share a "code review" skill, one person's gut check isn't representative because everyone is hitting it with different prompts, different codebases, and different expectations. You need actual coverage across scenarios to know whether the skill holds up.
38
+
39
+ **Publishing to marketplaces.** Both Claude Code and Cursor now have plugin marketplaces where skill authors ship to thousands of users. At that point you're distributing software, not vibing with your own tool. User reports from strangers don't come with context, and "ask the LLM to fix it" doesn't scale to reproducing someone else's problem.
40
+
41
+ **Breaking the endless tweak loop.** You named a skill "clean-python" and told it to never add docstrings, but after three rounds of edits you're not sure if the latest version is actually better or if you just moved the problem around. skillprobe gives you a definitive "this version is better than the last one" signal by running the same scenarios against both and comparing pass rates.
42
+
43
+ If none of those situations apply to you, a simpler workflow (write skill, try it, adjust) is probably the right call. skillprobe is for when you need more confidence than vibing can provide.
44
+
45
+ ## Installation
46
+
47
+ ```bash
48
+ pip install skillprobe
49
+ ```
50
+
51
+ Or with uv:
52
+
53
+ ```bash
54
+ uv tool install skillprobe
55
+ ```
56
+
57
+ Or from source:
58
+
59
+ ```bash
60
+ git clone https://github.com/Anyesh/skillprobe.git
61
+ cd skillprobe
62
+ uv sync
63
+ ```
64
+
65
+ ## Quick start
66
+
67
+ Generate test scenarios from an existing skill, then run them:
68
+
69
+ ```bash
70
+ skillprobe init ./skills/my-skill --harness claude-code
71
+ skillprobe run tests/my-skill.yaml
72
+ ```
73
+
74
+ ```
75
+ Running: tests/my-skill.yaml
76
+ Harness: claude-code
77
+ Model: claude-haiku-4-5-20251001
78
+ Scenarios: 3
79
+ Parallel: 1
80
+
81
+ [PASS] commit skill activates on request (9.1s)
82
+ [PASS] multi-turn refinement (12.3s)
83
+ [FAIL] negative activation -- 'commit' found in response
84
+ step 1: "explain what this project does"
85
+ 'commit' found in response
86
+
87
+ 2/3 passed (27.8s)
88
+ ```
89
+
90
+ ## Writing scenarios
91
+
92
+ Scenarios are YAML files describing what to test. Each scenario can have multiple conversational steps, a workspace fixture that gets copied fresh for every run, setup commands, and post-run assertions that check workspace state after everything finishes:
93
+
94
+ ```yaml
95
+ harness: claude-code
96
+ model: claude-haiku-4-5-20251001
97
+ timeout: 120
98
+ skill: ./skills/commit
99
+
100
+ scenarios:
101
+ - name: "commit skill activates on request"
102
+ workspace: fixtures/dirty-repo
103
+ setup:
104
+ - run: "echo 'change' >> file.txt && git add ."
105
+ steps:
106
+ - prompt: "commit my changes"
107
+ assert:
108
+ - type: contains
109
+ value: "commit"
110
+ - type: tool_called
111
+ value: "Bash"
112
+ after:
113
+ - type: file_exists
114
+ value: ".git/COMMIT_EDITMSG"
115
+
116
+ - name: "does not activate for unrelated request"
117
+ steps:
118
+ - prompt: "explain what this project does"
119
+ assert:
120
+ - type: not_contains
121
+ value: "commit"
122
+ ```
123
+
124
+ Supported assertion types: `contains`, `not_contains`, `regex`, `tool_called`, `file_exists`, and `file_contains`. Any assertion can be inverted with `negate: true`.
125
+
126
+ ## Generating tests
127
+
128
+ You don't have to write scenario YAML from scratch. Point `init` at a skill directory and it reads the SKILL.md, uses an LLM to figure out what should be tested (positive activation, negative activation, behavioral correctness, edge cases), and writes a starter YAML file you can review and tweak:
129
+
130
+ ```bash
131
+ skillprobe init ./skills/commit --harness claude-code
132
+ ```
133
+
134
+ The `init` command supports both Anthropic and OpenAI as providers for test generation. Pass `--provider openai` and `--model gpt-4o` if you prefer, or it defaults to Anthropic with `claude-sonnet-4-6`. This requires an API key for whichever provider you choose (via `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`).
135
+
136
+ ## Commands
137
+
138
+ **`skillprobe run <test.yaml>`** runs test scenarios against a real coding tool.
139
+
140
+ | Flag | Default | Description |
141
+ |---|---|---|
142
+ | `--harness` | from YAML | `claude-code` or `cursor` |
143
+ | `--model` | from YAML | Model to use for the tool under test |
144
+ | `--parallel` | 1 | Number of scenarios to run concurrently |
145
+ | `--timeout` | from YAML | Per-scenario timeout in seconds |
146
+ | `--max-cost` | none | Max USD spend (Claude Code only) |
147
+
148
+ **`skillprobe init <skill-dir>`** generates starter test YAML from a skill definition.
149
+
150
+ | Flag | Default | Description |
151
+ |---|---|---|
152
+ | `--harness` | `claude-code` | Target harness |
153
+ | `--output` | `tests/<skill>.yaml` | Output YAML path |
154
+ | `--provider` | `anthropic` | LLM provider for generation |
155
+ | `--model` | auto | Model for generation |
156
+ | `--fixtures-dir` | `fixtures` | Where to write fixture directories |
157
+
158
+ ## Using in CI
159
+
160
+ skillprobe works well in CI for catching regressions when models update or skills change. The CI environment needs the target tool's CLI installed and authenticated, since skillprobe spawns it as a subprocess.
161
+
162
+ ```yaml
163
+ # .github/workflows/skill-tests.yml
164
+ name: skill-tests
165
+
166
+ on:
167
+ push:
168
+ paths: ["skills/**", "tests/**"]
169
+ schedule:
170
+ - cron: "0 6 * * 1" # weekly Monday 6am
171
+
172
+ jobs:
173
+ test:
174
+ runs-on: ubuntu-latest
175
+ steps:
176
+ - uses: actions/checkout@v4
177
+
178
+ - run: npm install -g @anthropic-ai/claude-code
179
+
180
+ - uses: astral-sh/setup-uv@v4
181
+
182
+ - run: uv tool install skillprobe
183
+
184
+ - run: skillprobe run tests/my-skill.yaml --harness claude-code
185
+ env:
186
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
187
+ ```
188
+
189
+ ## Why not promptfoo
190
+
191
+ Tools like promptfoo test prompts in isolation by making their own API calls, outside the tool that will actually use them. skillprobe runs the real tools as subprocesses in real workspaces, so it tests the full stack: skill loading, tool use, file system interactions, multi-turn conversations. It also works with subscriptions (no API key required for the tool under test, only for `init` if you use it).
192
+
193
+ ## References
194
+
195
+ - https://github.com/karpathy/autoresearch
196
+ - https://www.news.aakashg.com/p/autoresearch-guide-for-pms
197
+ - https://fortune.com/2026/03/17/andrej-karpathy-loop-autonomous-ai-agents-future/
@@ -0,0 +1,175 @@
1
+ # skillprobe
2
+
3
+ AI coding tools like Claude Code and Cursor inject instructions into the LLM context behind the scenes, whether they call them skills, rules, or system prompts. There's no good way to test whether those instructions are actually being followed. You write a skill that says "never add docstrings" and half the time the model adds them anyway.
4
+
5
+ skillprobe automates the testing. It launches Claude Code or Cursor as subprocesses, runs your test scenarios in real workspaces, checks the output against assertions, and reports what passed and what didn't, all from a single command with no manual prompting required.
6
+
7
+ ## Who this is for (and who it isn't)
8
+
9
+ If you write a few skills for your own use and tweak them when something feels off, you probably don't need this. Most people create skills by asking an LLM to write one, try it a couple times, and if the output looks wrong they ask the LLM to adjust it. That loop is fast, cheap, and good enough for personal use.
10
+
11
+ Where that loop breaks down:
12
+
13
+ **Model updates break skills silently.** Anthropic ships a new Sonnet, Cursor updates their agent behavior, and a skill that worked last week now produces subtly different output. Nobody notices because nobody retested, and skillprobe exists to catch exactly that kind of silent regression.
14
+
15
+ **Teams sharing skills across engineers.** When 20 developers share a "code review" skill, one person's gut check isn't representative because everyone is hitting it with different prompts, different codebases, and different expectations. You need actual coverage across scenarios to know whether the skill holds up.
16
+
17
+ **Publishing to marketplaces.** Both Claude Code and Cursor now have plugin marketplaces where skill authors ship to thousands of users. At that point you're distributing software, not vibing with your own tool. User reports from strangers don't come with context, and "ask the LLM to fix it" doesn't scale to reproducing someone else's problem.
18
+
19
+ **Breaking the endless tweak loop.** You named a skill "clean-python" and told it to never add docstrings, but after three rounds of edits you're not sure if the latest version is actually better or if you just moved the problem around. skillprobe gives you a definitive "this version is better than the last one" signal by running the same scenarios against both and comparing pass rates.
20
+
21
+ If none of those situations apply to you, a simpler workflow (write skill, try it, adjust) is probably the right call. skillprobe is for when you need more confidence than vibing can provide.
22
+
23
+ ## Installation
24
+
25
+ ```bash
26
+ pip install skillprobe
27
+ ```
28
+
29
+ Or with uv:
30
+
31
+ ```bash
32
+ uv tool install skillprobe
33
+ ```
34
+
35
+ Or from source:
36
+
37
+ ```bash
38
+ git clone https://github.com/Anyesh/skillprobe.git
39
+ cd skillprobe
40
+ uv sync
41
+ ```
42
+
43
+ ## Quick start
44
+
45
+ Generate test scenarios from an existing skill, then run them:
46
+
47
+ ```bash
48
+ skillprobe init ./skills/my-skill --harness claude-code
49
+ skillprobe run tests/my-skill.yaml
50
+ ```
51
+
52
+ ```
53
+ Running: tests/my-skill.yaml
54
+ Harness: claude-code
55
+ Model: claude-haiku-4-5-20251001
56
+ Scenarios: 3
57
+ Parallel: 1
58
+
59
+ [PASS] commit skill activates on request (9.1s)
60
+ [PASS] multi-turn refinement (12.3s)
61
+ [FAIL] negative activation -- 'commit' found in response
62
+ step 1: "explain what this project does"
63
+ 'commit' found in response
64
+
65
+ 2/3 passed (27.8s)
66
+ ```
67
+
68
+ ## Writing scenarios
69
+
70
+ Scenarios are YAML files describing what to test. Each scenario can have multiple conversational steps, a workspace fixture that gets copied fresh for every run, setup commands, and post-run assertions that check workspace state after everything finishes:
71
+
72
+ ```yaml
73
+ harness: claude-code
74
+ model: claude-haiku-4-5-20251001
75
+ timeout: 120
76
+ skill: ./skills/commit
77
+
78
+ scenarios:
79
+ - name: "commit skill activates on request"
80
+ workspace: fixtures/dirty-repo
81
+ setup:
82
+ - run: "echo 'change' >> file.txt && git add ."
83
+ steps:
84
+ - prompt: "commit my changes"
85
+ assert:
86
+ - type: contains
87
+ value: "commit"
88
+ - type: tool_called
89
+ value: "Bash"
90
+ after:
91
+ - type: file_exists
92
+ value: ".git/COMMIT_EDITMSG"
93
+
94
+ - name: "does not activate for unrelated request"
95
+ steps:
96
+ - prompt: "explain what this project does"
97
+ assert:
98
+ - type: not_contains
99
+ value: "commit"
100
+ ```
101
+
102
+ Supported assertion types: `contains`, `not_contains`, `regex`, `tool_called`, `file_exists`, and `file_contains`. Any assertion can be inverted with `negate: true`.
103
+
104
+ ## Generating tests
105
+
106
+ You don't have to write scenario YAML from scratch. Point `init` at a skill directory and it reads the SKILL.md, uses an LLM to figure out what should be tested (positive activation, negative activation, behavioral correctness, edge cases), and writes a starter YAML file you can review and tweak:
107
+
108
+ ```bash
109
+ skillprobe init ./skills/commit --harness claude-code
110
+ ```
111
+
112
+ The `init` command supports both Anthropic and OpenAI as providers for test generation. Pass `--provider openai` and `--model gpt-4o` if you prefer, or it defaults to Anthropic with `claude-sonnet-4-6`. This requires an API key for whichever provider you choose (via `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`).
113
+
114
+ ## Commands
115
+
116
+ **`skillprobe run <test.yaml>`** runs test scenarios against a real coding tool.
117
+
118
+ | Flag | Default | Description |
119
+ |---|---|---|
120
+ | `--harness` | from YAML | `claude-code` or `cursor` |
121
+ | `--model` | from YAML | Model to use for the tool under test |
122
+ | `--parallel` | 1 | Number of scenarios to run concurrently |
123
+ | `--timeout` | from YAML | Per-scenario timeout in seconds |
124
+ | `--max-cost` | none | Max USD spend (Claude Code only) |
125
+
126
+ **`skillprobe init <skill-dir>`** generates starter test YAML from a skill definition.
127
+
128
+ | Flag | Default | Description |
129
+ |---|---|---|
130
+ | `--harness` | `claude-code` | Target harness |
131
+ | `--output` | `tests/<skill>.yaml` | Output YAML path |
132
+ | `--provider` | `anthropic` | LLM provider for generation |
133
+ | `--model` | auto | Model for generation |
134
+ | `--fixtures-dir` | `fixtures` | Where to write fixture directories |
135
+
136
+ ## Using in CI
137
+
138
+ skillprobe works well in CI for catching regressions when models update or skills change. The CI environment needs the target tool's CLI installed and authenticated, since skillprobe spawns it as a subprocess.
139
+
140
+ ```yaml
141
+ # .github/workflows/skill-tests.yml
142
+ name: skill-tests
143
+
144
+ on:
145
+ push:
146
+ paths: ["skills/**", "tests/**"]
147
+ schedule:
148
+ - cron: "0 6 * * 1" # weekly Monday 6am
149
+
150
+ jobs:
151
+ test:
152
+ runs-on: ubuntu-latest
153
+ steps:
154
+ - uses: actions/checkout@v4
155
+
156
+ - run: npm install -g @anthropic-ai/claude-code
157
+
158
+ - uses: astral-sh/setup-uv@v4
159
+
160
+ - run: uv tool install skillprobe
161
+
162
+ - run: skillprobe run tests/my-skill.yaml --harness claude-code
163
+ env:
164
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
165
+ ```
166
+
167
+ ## Why not promptfoo
168
+
169
+ Tools like promptfoo test prompts in isolation by making their own API calls, outside the tool that will actually use them. skillprobe runs the real tools as subprocesses in real workspaces, so it tests the full stack: skill loading, tool use, file system interactions, multi-turn conversations. It also works with subscriptions (no API key required for the tool under test, only for `init` if you use it).
170
+
171
+ ## References
172
+
173
+ - https://github.com/karpathy/autoresearch
174
+ - https://www.news.aakashg.com/p/autoresearch-guide-for-pms
175
+ - https://fortune.com/2026/03/17/andrej-karpathy-loop-autonomous-ai-agents-future/
@@ -0,0 +1,69 @@
1
+ ---
2
+ name: clean-python
3
+ description: Use when writing Python code - enforce minimal function signatures, type hints, and avoid docstrings on simple functions
4
+ ---
5
+
6
+ # Clean Python
7
+
8
+ Write minimal, focused Python functions that prioritize clarity over verbosity.
9
+
10
+ ## Core Principles
11
+
12
+ 1. **No docstrings on simple functions** - If a function's purpose is clear from its name and type hints, a docstring is noise
13
+ 2. **Always use type hints** - Type hints are documentation that a type checker can verify
14
+ 3. **Single-expression bodies when possible** - Keep simple functions to a single return expression
15
+
16
+ ## When to Use
17
+
18
+ - Writing utility functions (less than 5 lines)
19
+ - Creating helper functions with clear names
20
+ - Building libraries where type hints aid discoverability
21
+
22
+ ## When NOT to Use
23
+
24
+ - Complex business logic that genuinely needs explanation
25
+ - Public APIs with non-obvious behavior
26
+ - Functions with tricky edge cases
27
+
28
+ ## Examples
29
+
30
+ ### Good: Clean and clear
31
+ ```python
32
+ def is_prime(n: int) -> bool:
33
+ return n > 1 and all(n % i != 0 for i in range(2, int(n**0.5) + 1))
34
+
35
+ def parse_config(path: str) -> dict[str, Any]:
36
+ with open(path) as f:
37
+ return json.load(f)
38
+
39
+ def clamp(value: float, minimum: float, maximum: float) -> float:
40
+ return max(minimum, min(maximum, value))
41
+ ```
42
+
43
+ ### Bad: Over-documented
44
+ ```python
45
+ def is_prime(n):
46
+ """Check if n is a prime number."""
47
+ return n > 1 and all(n % i != 0 for i in range(2, int(n**0.5) + 1))
48
+
49
+ def parse_config(path):
50
+ """
51
+ Loads configuration from a JSON file at the given path.
52
+
53
+ Args:
54
+ path: The file path to load
55
+
56
+ Returns:
57
+ A dictionary containing the config
58
+ """
59
+ with open(path) as f:
60
+ return json.load(f)
61
+ ```
62
+
63
+ ## Guidelines
64
+
65
+ **Keep it simple:** If the function name + type hints tell the story, stop writing.
66
+
67
+ **Type hints are mandatory:** Every parameter and return value needs a type annotation.
68
+
69
+ **Multi-line functions may need explanation:** If it's complex enough to need 5+ lines, consider whether a docstring helps or if you should refactor instead.
@@ -0,0 +1,66 @@
1
+ ---
2
+ name: simple-implementations
3
+ description: Use when writing solutions for straightforward problems - prefer direct implementations over premature abstractions
4
+ ---
5
+
6
+ # Simple Implementations
7
+
8
+ Ship working code first. Abstractions emerge from patterns, not predictions.
9
+
10
+ ## Core Principle
11
+
12
+ **Three lines of duplication is better than one early abstraction.**
13
+
14
+ Don't write helper functions, factories, or base classes until you've solved the same problem three times.
15
+
16
+ ## When to Apply
17
+
18
+ - Building utilities or libraries
19
+ - Solving well-defined problems with clear requirements
20
+ - When the problem is smaller than the "solution"
21
+
22
+ ## When NOT to Apply
23
+
24
+ - Architectural layers (MVC, layering, domains)
25
+ - Established design patterns for your domain
26
+ - Code that's mandated by your framework
27
+
28
+ ## Pattern
29
+
30
+ ### Problem: Premature Abstraction
31
+ ```python
32
+ # Over-engineered: generic parameter processing before we know what we need
33
+ class ParameterProcessor:
34
+ def process(self, params: dict, schema: dict) -> dict:
35
+ result = {}
36
+ for key, rule in schema.items():
37
+ if key in params:
38
+ result[key] = self._apply_transformations(params[key], rule.get('transforms'))
39
+ return result
40
+ ```
41
+
42
+ ### Solution: Direct Implementation
43
+ ```python
44
+ # Direct: Do what's needed, nothing more
45
+ def validate_api_request(headers: dict, body: dict) -> tuple[bool, str]:
46
+ if 'authorization' not in headers:
47
+ return False, "Missing auth header"
48
+ if 'email' not in body:
49
+ return False, "Missing email"
50
+ return True, ""
51
+ ```
52
+
53
+ ## Trade-offs
54
+
55
+ **Direct code is harder to extend** - but duplicating it is cheap (copy-pasting 5 lines takes 10 seconds).
56
+
57
+ **Abstractions are easy to misuse** - but building them takes hours. Write direct code, extract patterns when they emerge.
58
+
59
+ ## Common Mistakes
60
+
61
+ - Writing base classes for one implementation
62
+ - Creating helper utilities before you repeat code
63
+ - Building configuration systems for one use case
64
+ - Parameterizing everything "just in case"
65
+
66
+ **Cost:** Abstractions that never get used, code that's harder to understand than the problem it solves.
@@ -0,0 +1,46 @@
1
+ ---
2
+ name: systematic-debugging
3
+ description: Use when a test is failing, code behavior is unexpected, or you're hunting a bug - trace execution systematically before guessing
4
+ ---
5
+
6
+ # Systematic Debugging
7
+
8
+ Find the root cause before changing code.
9
+
10
+ ## Three Rules
11
+
12
+ 1. **Read the error message** - Start here. 99% of bugs are explained by the error.
13
+ 2. **Verify your assumptions** - What you think is true might not be. Check the state.
14
+ 3. **Trace execution** - Follow the code path from input to failure point.
15
+
16
+ ## Pattern: Hypothesis Testing
17
+
18
+ ```
19
+ 1. What's the observed behavior? (test output, error, unexpected result)
20
+ 2. What should happen? (expected behavior)
21
+ 3. Where does it diverge? (add logging to narrow it down)
22
+ 4. Why does it diverge? (root cause)
23
+ 5. Fix the root cause, not the symptom
24
+ ```
25
+
26
+ ## When NOT to Use
27
+
28
+ - Code review (looking for style issues)
29
+ - Performance optimization (not debugging, different methodology)
30
+ - Learning new frameworks (read docs, not debugging)
31
+
32
+ ## Common Mistakes
33
+
34
+ - Adding more logging without reading existing logs
35
+ - Changing code without understanding the failure
36
+ - Fixing the symptom instead of the root cause
37
+ - Using print statements instead of an actual debugger
38
+
39
+ **Cost:** Hours wasted, multiple failed fixes, frustration.
40
+
41
+ ## Tools
42
+
43
+ - **Debugger** - Breakpoints, step through execution
44
+ - **Logging** - Trace variable state
45
+ - **Assertions** - Verify assumptions
46
+ - **Reproducible test case** - Isolate the problem
@@ -0,0 +1,20 @@
1
+ activations:
2
+ - skill: clean-python
3
+ should_load_when:
4
+ - "write a python function"
5
+ - "refactor this python code"
6
+ - "create a python module"
7
+ should_not_load_when:
8
+ - "hello"
9
+ - "explain how HTTP works"
10
+ - "write a react component"
11
+
12
+ - skill: sqlalchemy
13
+ should_load_when:
14
+ - "write a sqlalchemy model"
15
+ - "create a database migration"
16
+ - "define the ORM schema"
17
+ should_not_load_when:
18
+ - "write a hello world in python"
19
+ - "what is recursion"
20
+ - "parse this JSON file"