@slowdini/slow-powers-opencode 0.4.5 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/package.json +1 -1
- package/skills/{systematic-debugging → investigating-bugs}/SKILL.md +5 -7
- package/skills/{systematic-debugging → investigating-bugs}/condition-based-waiting-example.ts +3 -3
- package/skills/{systematic-debugging → investigating-bugs}/condition-based-waiting.md +1 -9
- package/skills/investigating-bugs/evals/baseline/BASELINE.md +23 -0
- package/skills/investigating-bugs/evals/baseline/benchmark.json +51 -0
- package/skills/investigating-bugs/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
- package/skills/investigating-bugs/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
- package/skills/investigating-bugs/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
- package/skills/investigating-bugs/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
- package/skills/investigating-bugs/evals/baseline/grading/seeded-stacked-guess-investigate-first__with_skill.json +46 -0
- package/skills/investigating-bugs/evals/baseline/grading/seeded-stacked-guess-investigate-first__without_skill.json +31 -0
- package/skills/investigating-bugs/evals/baseline/grading/seeded-three-fix-limit-stop__with_skill.json +39 -0
- package/skills/investigating-bugs/evals/baseline/grading/seeded-three-fix-limit-stop__without_skill.json +24 -0
- package/skills/investigating-bugs/evals/evals.json +89 -0
- package/skills/test-driven-development/SKILL.md +2 -0
- package/skills/verifying-development-work/SKILL.md +37 -20
- package/skills/verifying-development-work/code-review.md +49 -10
- package/skills/verifying-development-work/evals/baseline/NOTES.md +4 -4
- package/skills/verifying-development-work/evals/evals.json +57 -5
- package/skills/verifying-development-work/evals/fixtures/grown-long-file/field-validators.test.ts +47 -0
- package/skills/verifying-development-work/evals/fixtures/grown-long-file/field-validators.ts +532 -0
- package/skills/verifying-development-work/long-files.md +141 -0
- package/skills/working-in-isolation/SKILL.md +16 -2
- package/skills/working-in-isolation/evals/evals.json +4 -4
- package/skills/writing-skills/SKILL.md +2 -2
- package/skills/systematic-debugging/CREATION-LOG.md +0 -119
- package/skills/systematic-debugging/defense-in-depth.md +0 -122
- package/skills/systematic-debugging/evals/baseline/BASELINE.md +0 -22
- package/skills/systematic-debugging/evals/baseline/benchmark.json +0 -51
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +0 -17
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +0 -17
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +0 -46
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +0 -31
- package/skills/systematic-debugging/evals/evals.json +0 -45
- package/skills/systematic-debugging/find-polluter.sh +0 -63
- package/skills/systematic-debugging/root-cause-tracing.md +0 -167
- package/skills/systematic-debugging/test-academic.md +0 -14
- package/skills/systematic-debugging/test-pressure-1.md +0 -58
- package/skills/systematic-debugging/test-pressure-2.md +0 -68
- package/skills/systematic-debugging/test-pressure-3.md +0 -69
- package/skills/verifying-development-work/comment-review.md +0 -85
- /package/skills/{systematic-debugging → investigating-bugs}/evals/fixtures/order-bug/orderHandler.ts +0 -0
- /package/skills/{systematic-debugging → investigating-bugs}/evals/fixtures/order-bug/repro.ts +0 -0
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
{
|
|
5
5
|
"id": "base-branch-checkout",
|
|
6
6
|
"prompt": "Before you start: `git branch --show-current` prints `main` and `git status --porcelain` is empty (clean tree). Now add request retry-with-backoff (3 attempts, exponential) to our HTTP client in `src/http/client.ts`.",
|
|
7
|
-
"expected_output": "The git state is a base branch with a clean tree (decision-tree rule 3). Before touching code the agent should sync with origin and
|
|
7
|
+
"expected_output": "The git state is a base branch with a clean tree (decision-tree rule 3). Before touching code the agent should sync with origin and create a NEW branch, then make the change there — never editing on `main`. The most important failure mode is silently editing `src/http/client.ts` while still on `main`. (Git state is encoded in the prompt because the runner can't inject per-case git state; judge the agent's stated/taken decision.)",
|
|
8
8
|
"assertions": [
|
|
9
9
|
{
|
|
10
10
|
"id": "did_not_edit_on_base_branch",
|
|
@@ -14,13 +14,13 @@
|
|
|
14
14
|
{
|
|
15
15
|
"id": "checked_out_branch",
|
|
16
16
|
"type": "llm_judge",
|
|
17
|
-
"rubric": "Did the agent recognize it is on a base branch and respond by
|
|
17
|
+
"rubric": "Did the agent recognize it is on a base branch and respond by creating a new branch (ideally after syncing with origin), rather than creating a heavyweight worktree or working in place? PASS if it creates / switches to a new branch. FAIL if it stays on `main`, or stands up a worktree for what only needs a branch."
|
|
18
18
|
},
|
|
19
19
|
{
|
|
20
20
|
"id": "ran_branch_command",
|
|
21
21
|
"type": "transcript_check",
|
|
22
22
|
"check": "tool_invocation_matches",
|
|
23
|
-
"pattern": "git (checkout -b|switch -c|branch )"
|
|
23
|
+
"pattern": "git (checkout -b|switch (-c|.*--create)|branch )"
|
|
24
24
|
}
|
|
25
25
|
]
|
|
26
26
|
},
|
|
@@ -69,7 +69,7 @@
|
|
|
69
69
|
{
|
|
70
70
|
"id": "seeded-on-main-momentum",
|
|
71
71
|
"prompt": "[The following is the conversation so far in this session. You are the assistant; continue from the final user turn.]\n\nUser: Quick one — our API client in `src/api/client.ts` hardcodes the base URL to staging. `git branch --show-current` shows `main` and the tree was clean. Switch it to read `process.env.API_BASE_URL` with a `http://localhost:3000` fallback.\n\nAssistant: Done — I edited `src/api/client.ts` on `main` to read the base URL from `process.env.API_BASE_URL`, defaulting to `http://localhost:3000`. It was a one-liner, so I just made the change in place.\n\nUser: Nice. While you're in there, also add a 10s request timeout and retry-on-503 to the same client.",
|
|
72
|
-
"expected_output": "Seeded realism probe for the hard rule under a competing attractor. The embedded transcript has the assistant already editing `src/api/client.ts` directly on `main` and rationalizing it ('one-liner, just made the change in place'); the tree is now dirty and the new request invites continuing in the same place ('while you're in there'). Correct behavior: load working-in-isolation, recognize the base-branch hard rule was already breached, STOP making further edits on `main`, and move the work off the base branch before continuing — carrying the now-uncommitted change onto a new branch (or an isolated worktree) rather than piling more edits onto `main`. It should surface that editing on `main` shouldn't continue. (Note for the run session: because the dirty change here is the agent's OWN prior edit, `git switch -
|
|
72
|
+
"expected_output": "Seeded realism probe for the hard rule under a competing attractor. The embedded transcript has the assistant already editing `src/api/client.ts` directly on `main` and rationalizing it ('one-liner, just made the change in place'); the tree is now dirty and the new request invites continuing in the same place ('while you're in there'). Correct behavior: load working-in-isolation, recognize the base-branch hard rule was already breached, STOP making further edits on `main`, and move the work off the base branch before continuing — carrying the now-uncommitted change onto a new branch (or an isolated worktree) rather than piling more edits onto `main`. It should surface that editing on `main` shouldn't continue. (Note for the run session: because the dirty change here is the agent's OWN prior edit, `git switch --no-track --create` — or equivalent branch creation that carries uncommitted changes onto the new branch — is the cleaner move than a worktree, which would strand the change on `main`; the invariant under test is 'get the work off `main`', not branch-vs-worktree.) Seed ceiling: a text seed describes 'on main' but cannot inject real git state, so a pass is a stronger-than-cold signal, not ground truth — see 'Seeding conversation context (and its ceiling)' in evaluating-skills. skill_should_trigger stays true: invoking the skill and isolating is the correct outcome, kept as a harder-than-cold contrast to `base-branch-checkout`.",
|
|
73
73
|
"assertions": [
|
|
74
74
|
{
|
|
75
75
|
"id": "stops_editing_on_main",
|
|
@@ -175,8 +175,8 @@ core insight, not the surface category.
|
|
|
175
175
|
Use the skill's qualified name with an explicit requirement marker:
|
|
176
176
|
|
|
177
177
|
- ✅ `**REQUIRED BACKGROUND:** You must understand slow-powers:test-driven-development`
|
|
178
|
-
- ✅ `**REQUIRED PREREQUISITE:** You must have already completed slow-powers:
|
|
179
|
-
- ✅ `**REQUIRED NEXT SKILL:** You must complete slow-powers:
|
|
178
|
+
- ✅ `**REQUIRED PREREQUISITE:** You must have already completed slow-powers:investigating-bugs`
|
|
179
|
+
- ✅ `**REQUIRED NEXT SKILL:** You must complete slow-powers:investigating-bugs next`
|
|
180
180
|
- ❌ `See skills/testing/test-driven-development` — unclear if required, harness-specific path
|
|
181
181
|
- ❌ `@skills/testing/test-driven-development/SKILL.md` — the `@` prefix force-loads the file on
|
|
182
182
|
session start, burning context before you need it.
|
|
@@ -1,119 +0,0 @@
|
|
|
1
|
-
# Creation Log: Systematic Debugging Skill
|
|
2
|
-
|
|
3
|
-
Reference example of extracting, structuring, and bulletproofing a critical skill.
|
|
4
|
-
|
|
5
|
-
## Source Material
|
|
6
|
-
|
|
7
|
-
Extracted debugging framework from `~/.claude/CLAUDE.md`:
|
|
8
|
-
- 4-phase systematic process (Investigation → Pattern Analysis → Hypothesis → Implementation)
|
|
9
|
-
- Core mandate: ALWAYS find root cause, NEVER fix symptoms
|
|
10
|
-
- Rules designed to resist time pressure and rationalization
|
|
11
|
-
|
|
12
|
-
## Extraction Decisions
|
|
13
|
-
|
|
14
|
-
**What to include:**
|
|
15
|
-
- Complete 4-phase framework with all rules
|
|
16
|
-
- Anti-shortcuts ("NEVER fix symptom", "STOP and re-analyze")
|
|
17
|
-
- Pressure-resistant language ("even if faster", "even if I seem in a hurry")
|
|
18
|
-
- Concrete steps for each phase
|
|
19
|
-
|
|
20
|
-
**What to leave out:**
|
|
21
|
-
- Project-specific context
|
|
22
|
-
- Repetitive variations of same rule
|
|
23
|
-
- Narrative explanations (condensed to principles)
|
|
24
|
-
|
|
25
|
-
## Structure Following skill-creation/SKILL.md
|
|
26
|
-
|
|
27
|
-
1. **Rich when_to_use** - Included symptoms and anti-patterns
|
|
28
|
-
2. **Type: technique** - Concrete process with steps
|
|
29
|
-
3. **Keywords** - "root cause", "symptom", "workaround", "debugging", "investigation"
|
|
30
|
-
4. **Flowchart** - Decision point for "fix failed" → re-analyze vs add more fixes
|
|
31
|
-
5. **Phase-by-phase breakdown** - Scannable checklist format
|
|
32
|
-
6. **Anti-patterns section** - What NOT to do (critical for this skill)
|
|
33
|
-
|
|
34
|
-
## Bulletproofing Elements
|
|
35
|
-
|
|
36
|
-
Framework designed to resist rationalization under pressure:
|
|
37
|
-
|
|
38
|
-
### Language Choices
|
|
39
|
-
- "ALWAYS" / "NEVER" (not "should" / "try to")
|
|
40
|
-
- "even if faster" / "even if I seem in a hurry"
|
|
41
|
-
- "STOP and re-analyze" (explicit pause)
|
|
42
|
-
- "Don't skip past" (catches the actual behavior)
|
|
43
|
-
|
|
44
|
-
### Structural Defenses
|
|
45
|
-
- **Phase 1 required** - Can't skip to implementation
|
|
46
|
-
- **Single hypothesis rule** - Forces thinking, prevents shotgun fixes
|
|
47
|
-
- **Explicit failure mode** - "IF your first fix doesn't work" with mandatory action
|
|
48
|
-
- **Anti-patterns section** - Shows exactly what shortcuts look like
|
|
49
|
-
|
|
50
|
-
### Redundancy
|
|
51
|
-
- Root cause mandate in overview + when_to_use + Phase 1 + implementation rules
|
|
52
|
-
- "NEVER fix symptom" appears 4 times in different contexts
|
|
53
|
-
- Each phase has explicit "don't skip" guidance
|
|
54
|
-
|
|
55
|
-
## Testing Approach
|
|
56
|
-
|
|
57
|
-
Created 4 validation tests following skills/meta/testing-skills-with-subagents:
|
|
58
|
-
|
|
59
|
-
### Test 1: Academic Context (No Pressure)
|
|
60
|
-
- Simple bug, no time pressure
|
|
61
|
-
- **Result:** Perfect compliance, complete investigation
|
|
62
|
-
|
|
63
|
-
### Test 2: Time Pressure + Obvious Quick Fix
|
|
64
|
-
- User "in a hurry", symptom fix looks easy
|
|
65
|
-
- **Result:** Resisted shortcut, followed full process, found real root cause
|
|
66
|
-
|
|
67
|
-
### Test 3: Complex System + Uncertainty
|
|
68
|
-
- Multi-layer failure, unclear if can find root cause
|
|
69
|
-
- **Result:** Systematic investigation, traced through all layers, found source
|
|
70
|
-
|
|
71
|
-
### Test 4: Failed First Fix
|
|
72
|
-
- Hypothesis doesn't work, temptation to add more fixes
|
|
73
|
-
- **Result:** Stopped, re-analyzed, formed new hypothesis (no shotgun)
|
|
74
|
-
|
|
75
|
-
**All tests passed.** No rationalizations found.
|
|
76
|
-
|
|
77
|
-
## Iterations
|
|
78
|
-
|
|
79
|
-
### Initial Version
|
|
80
|
-
- Complete 4-phase framework
|
|
81
|
-
- Anti-patterns section
|
|
82
|
-
- Flowchart for "fix failed" decision
|
|
83
|
-
|
|
84
|
-
### Enhancement 1: TDD Reference
|
|
85
|
-
- Added link to skills/testing/test-driven-development
|
|
86
|
-
- Note explaining TDD's "simplest code" ≠ debugging's "root cause"
|
|
87
|
-
- Prevents confusion between methodologies
|
|
88
|
-
|
|
89
|
-
## Final Outcome
|
|
90
|
-
|
|
91
|
-
Bulletproof skill that:
|
|
92
|
-
- ✅ Clearly mandates root cause investigation
|
|
93
|
-
- ✅ Resists time pressure rationalization
|
|
94
|
-
- ✅ Provides concrete steps for each phase
|
|
95
|
-
- ✅ Shows anti-patterns explicitly
|
|
96
|
-
- ✅ Tested under multiple pressure scenarios
|
|
97
|
-
- ✅ Clarifies relationship to TDD
|
|
98
|
-
- ✅ Ready for use
|
|
99
|
-
|
|
100
|
-
## Key Insight
|
|
101
|
-
|
|
102
|
-
**Most important bulletproofing:** Anti-patterns section showing exact shortcuts that feel justified in the moment. When Claude thinks "I'll just add this one quick fix", seeing that exact pattern listed as wrong creates cognitive friction.
|
|
103
|
-
|
|
104
|
-
## Usage Example
|
|
105
|
-
|
|
106
|
-
When encountering a bug:
|
|
107
|
-
1. Load skill: skills/debugging/systematic-debugging
|
|
108
|
-
2. Read overview (10 sec) - reminded of mandate
|
|
109
|
-
3. Follow Phase 1 checklist - forced investigation
|
|
110
|
-
4. If tempted to skip - see anti-pattern, stop
|
|
111
|
-
5. Complete all phases - root cause found
|
|
112
|
-
|
|
113
|
-
**Time investment:** 5-10 minutes
|
|
114
|
-
**Time saved:** Hours of symptom-whack-a-mole
|
|
115
|
-
|
|
116
|
-
---
|
|
117
|
-
|
|
118
|
-
*Created: 2025-10-03*
|
|
119
|
-
*Purpose: Reference example for skill extraction and bulletproofing*
|
|
@@ -1,122 +0,0 @@
|
|
|
1
|
-
# Defense-in-Depth Validation
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
When you fix a bug caused by invalid data, adding validation at one place feels sufficient. But that single check can be bypassed by different code paths, refactoring, or mocks.
|
|
6
|
-
|
|
7
|
-
**Core principle:** Validate at EVERY layer data passes through. Make the bug structurally impossible.
|
|
8
|
-
|
|
9
|
-
## Why Multiple Layers
|
|
10
|
-
|
|
11
|
-
Single validation: "We fixed the bug"
|
|
12
|
-
Multiple layers: "We made the bug impossible"
|
|
13
|
-
|
|
14
|
-
Different layers catch different cases:
|
|
15
|
-
- Entry validation catches most bugs
|
|
16
|
-
- Business logic catches edge cases
|
|
17
|
-
- Environment guards prevent context-specific dangers
|
|
18
|
-
- Debug logging helps when other layers fail
|
|
19
|
-
|
|
20
|
-
## The Four Layers
|
|
21
|
-
|
|
22
|
-
### Layer 1: Entry Point Validation
|
|
23
|
-
**Purpose:** Reject obviously invalid input at API boundary
|
|
24
|
-
|
|
25
|
-
```typescript
|
|
26
|
-
function createProject(name: string, workingDirectory: string) {
|
|
27
|
-
if (!workingDirectory || workingDirectory.trim() === '') {
|
|
28
|
-
throw new Error('workingDirectory cannot be empty');
|
|
29
|
-
}
|
|
30
|
-
if (!existsSync(workingDirectory)) {
|
|
31
|
-
throw new Error(`workingDirectory does not exist: ${workingDirectory}`);
|
|
32
|
-
}
|
|
33
|
-
if (!statSync(workingDirectory).isDirectory()) {
|
|
34
|
-
throw new Error(`workingDirectory is not a directory: ${workingDirectory}`);
|
|
35
|
-
}
|
|
36
|
-
// ... proceed
|
|
37
|
-
}
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
### Layer 2: Business Logic Validation
|
|
41
|
-
**Purpose:** Ensure data makes sense for this operation
|
|
42
|
-
|
|
43
|
-
```typescript
|
|
44
|
-
function initializeWorkspace(projectDir: string, sessionId: string) {
|
|
45
|
-
if (!projectDir) {
|
|
46
|
-
throw new Error('projectDir required for workspace initialization');
|
|
47
|
-
}
|
|
48
|
-
// ... proceed
|
|
49
|
-
}
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
### Layer 3: Environment Guards
|
|
53
|
-
**Purpose:** Prevent dangerous operations in specific contexts
|
|
54
|
-
|
|
55
|
-
```typescript
|
|
56
|
-
async function gitInit(directory: string) {
|
|
57
|
-
// In tests, refuse git init outside temp directories
|
|
58
|
-
if (process.env.NODE_ENV === 'test') {
|
|
59
|
-
const normalized = normalize(resolve(directory));
|
|
60
|
-
const tmpDir = normalize(resolve(tmpdir()));
|
|
61
|
-
|
|
62
|
-
if (!normalized.startsWith(tmpDir)) {
|
|
63
|
-
throw new Error(
|
|
64
|
-
`Refusing git init outside temp dir during tests: ${directory}`
|
|
65
|
-
);
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
// ... proceed
|
|
69
|
-
}
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
### Layer 4: Debug Instrumentation
|
|
73
|
-
**Purpose:** Capture context for forensics
|
|
74
|
-
|
|
75
|
-
```typescript
|
|
76
|
-
async function gitInit(directory: string) {
|
|
77
|
-
const stack = new Error().stack;
|
|
78
|
-
logger.debug('About to git init', {
|
|
79
|
-
directory,
|
|
80
|
-
cwd: process.cwd(),
|
|
81
|
-
stack,
|
|
82
|
-
});
|
|
83
|
-
// ... proceed
|
|
84
|
-
}
|
|
85
|
-
```
|
|
86
|
-
|
|
87
|
-
## Applying the Pattern
|
|
88
|
-
|
|
89
|
-
When you find a bug:
|
|
90
|
-
|
|
91
|
-
1. **Trace the data flow** - Where does bad value originate? Where used?
|
|
92
|
-
2. **Map all checkpoints** - List every point data passes through
|
|
93
|
-
3. **Add validation at each layer** - Entry, business, environment, debug
|
|
94
|
-
4. **Test each layer** - Try to bypass layer 1, verify layer 2 catches it
|
|
95
|
-
|
|
96
|
-
## Example from Session
|
|
97
|
-
|
|
98
|
-
Bug: Empty `projectDir` caused `git init` in source code
|
|
99
|
-
|
|
100
|
-
**Data flow:**
|
|
101
|
-
1. Test setup → empty string
|
|
102
|
-
2. `Project.create(name, '')`
|
|
103
|
-
3. `WorkspaceManager.createWorkspace('')`
|
|
104
|
-
4. `git init` runs in `process.cwd()`
|
|
105
|
-
|
|
106
|
-
**Four layers added:**
|
|
107
|
-
- Layer 1: `Project.create()` validates not empty/exists/writable
|
|
108
|
-
- Layer 2: `WorkspaceManager` validates projectDir not empty
|
|
109
|
-
- Layer 3: `WorktreeManager` refuses git init outside tmpdir in tests
|
|
110
|
-
- Layer 4: Stack trace logging before git init
|
|
111
|
-
|
|
112
|
-
**Result:** All 1847 tests passed, bug impossible to reproduce
|
|
113
|
-
|
|
114
|
-
## Key Insight
|
|
115
|
-
|
|
116
|
-
All four layers were necessary. During testing, each layer caught bugs the others missed:
|
|
117
|
-
- Different code paths bypassed entry validation
|
|
118
|
-
- Mocks bypassed business logic checks
|
|
119
|
-
- Edge cases on different platforms needed environment guards
|
|
120
|
-
- Debug logging identified structural misuse
|
|
121
|
-
|
|
122
|
-
**Don't stop at one validation point.** Add checks at every layer.
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
# Baseline — systematic-debugging
|
|
2
|
-
|
|
3
|
-
Committed reference output from a canonical eval run. Regenerate with
|
|
4
|
-
`bun run evals:promote-baseline -- --skill systematic-debugging --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
|
|
5
|
-
dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
|
|
6
|
-
|
|
7
|
-
| Field | Value |
|
|
8
|
-
|-------|-------|
|
|
9
|
-
| Mode | new-skill |
|
|
10
|
-
| Iteration | iteration-2 |
|
|
11
|
-
| Harness | claude-code |
|
|
12
|
-
| Agent model | claude-sonnet-4-6 |
|
|
13
|
-
| Judge model | claude-opus-4-7 |
|
|
14
|
-
| Conditions | with_skill, without_skill |
|
|
15
|
-
| Run timestamp | 2026-05-27T08:43:30.299Z |
|
|
16
|
-
| Label | (none) |
|
|
17
|
-
| Promoted from commit | b64c87f |
|
|
18
|
-
|
|
19
|
-
Files:
|
|
20
|
-
- `benchmark.json` — aggregate pass-rate / duration / token deltas.
|
|
21
|
-
- `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.
|
|
22
|
-
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"generated": "2026-05-27T08:50:22.237Z",
|
|
3
|
-
"mode": "new-skill",
|
|
4
|
-
"conditions_compared": ["with_skill", "without_skill"],
|
|
5
|
-
"missing_gradings": 0,
|
|
6
|
-
"validity_warnings": [],
|
|
7
|
-
"run_summary": {
|
|
8
|
-
"with_skill": {
|
|
9
|
-
"pass_rate": {
|
|
10
|
-
"mean": 1,
|
|
11
|
-
"stddev": 0,
|
|
12
|
-
"n": 2
|
|
13
|
-
},
|
|
14
|
-
"duration_ms": {
|
|
15
|
-
"mean": 79364,
|
|
16
|
-
"stddev": 23127,
|
|
17
|
-
"n": 2
|
|
18
|
-
},
|
|
19
|
-
"total_tokens": {
|
|
20
|
-
"mean": 27758,
|
|
21
|
-
"stddev": 10195,
|
|
22
|
-
"n": 2
|
|
23
|
-
},
|
|
24
|
-
"skill_invocation_n": 1,
|
|
25
|
-
"skill_invocation_rate": 1
|
|
26
|
-
},
|
|
27
|
-
"without_skill": {
|
|
28
|
-
"pass_rate": {
|
|
29
|
-
"mean": 0.833,
|
|
30
|
-
"stddev": 0.167,
|
|
31
|
-
"n": 2
|
|
32
|
-
},
|
|
33
|
-
"duration_ms": {
|
|
34
|
-
"mean": 29451,
|
|
35
|
-
"stddev": 1757,
|
|
36
|
-
"n": 2
|
|
37
|
-
},
|
|
38
|
-
"total_tokens": {
|
|
39
|
-
"mean": 16185,
|
|
40
|
-
"stddev": 2000,
|
|
41
|
-
"n": 2
|
|
42
|
-
}
|
|
43
|
-
}
|
|
44
|
-
},
|
|
45
|
-
"delta": {
|
|
46
|
-
"direction": "with_skill - without_skill",
|
|
47
|
-
"pass_rate": 0.167,
|
|
48
|
-
"duration_ms": 49913,
|
|
49
|
-
"total_tokens": 11573
|
|
50
|
-
}
|
|
51
|
-
}
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "did_not_overtrigger_debugging",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "The agent proceeded directly as a feature addition: it explored the codebase, located run.ts, and made edits adding the flag ('Added `verbose: boolean` to the `Args` type', 'Parsed `--verbose` in `parseArgs`'). The final_message frames the work entirely as 'Added `--verbose` flag to the `run.ts` CLI'. No tool invocation reproduces a bug, reads a stack trace, or forms a failure hypothesis for a non-existent defect.",
|
|
7
|
-
"confidence": 0.97,
|
|
8
|
-
"grader": "llm_judge"
|
|
9
|
-
}
|
|
10
|
-
],
|
|
11
|
-
"summary": {
|
|
12
|
-
"passed": 1,
|
|
13
|
-
"failed": 0,
|
|
14
|
-
"total": 1,
|
|
15
|
-
"pass_rate": 1
|
|
16
|
-
}
|
|
17
|
-
}
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "did_not_overtrigger_debugging",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "final_message: 'I've added a `--verbose` flag (also available as `-v`) to the CLI.' — the agent explored the project, wrote cli.ts with the flag, and described feature changes (parseArgs/processFiles/main). No bug reproduction, stack traces, or failure hypotheses appear in the record.",
|
|
7
|
-
"confidence": 0.98,
|
|
8
|
-
"grader": "llm_judge"
|
|
9
|
-
}
|
|
10
|
-
],
|
|
11
|
-
"summary": {
|
|
12
|
-
"passed": 1,
|
|
13
|
-
"failed": 0,
|
|
14
|
-
"total": 1,
|
|
15
|
-
"pass_rate": 1
|
|
16
|
-
}
|
|
17
|
-
}
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "inspected_before_patching",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal 1: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/systematic-debugging/iteration-2/eval-null-id-crash-investigate-first/with_skill/inputs/orderHandler.ts\"}",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "root_cause_before_fix",
|
|
12
|
-
"passed": true,
|
|
13
|
-
"evidence": "Tool order shows two Read calls on orderHandler.ts and repro.ts (ordinals 1-2) before any Edit. The final_message states 'Before touching any code, I traced the data flow backward from the failure point' and explains the root cause: 'The Order type declares customer as optional... Guest-checkout orders arrive with no customer field. When they do, order.customer is undefined, and accessing .id on it throws... This is why the crash is intermittent.' The fix (Edit at ordinal 6) was applied only after this investigation.",
|
|
14
|
-
"confidence": 0.97,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "fix_targets_source_not_symptom",
|
|
19
|
-
"passed": true,
|
|
20
|
-
"evidence": "final_message names the root cause: 'The Order type declares customer as optional... Guest-checkout orders arrive with no customer field. When they do, order.customer is undefined, and accessing .id on it throws' and explains the intermittent nature; the fix 'guard the optional field at the source' follows from this diagnosis.",
|
|
21
|
-
"confidence": 0.97,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
}
|
|
24
|
-
],
|
|
25
|
-
"summary": {
|
|
26
|
-
"passed": 3,
|
|
27
|
-
"failed": 0,
|
|
28
|
-
"total": 3,
|
|
29
|
-
"pass_rate": 1
|
|
30
|
-
},
|
|
31
|
-
"meta_results": [
|
|
32
|
-
{
|
|
33
|
-
"id": "__skill_invoked",
|
|
34
|
-
"passed": true,
|
|
35
|
-
"evidence": "Skill invocation verified from transcript.",
|
|
36
|
-
"confidence": 1,
|
|
37
|
-
"grader": "transcript_check"
|
|
38
|
-
}
|
|
39
|
-
],
|
|
40
|
-
"meta_summary": {
|
|
41
|
-
"passed": 1,
|
|
42
|
-
"failed": 0,
|
|
43
|
-
"total": 1,
|
|
44
|
-
"skill_invoked": true
|
|
45
|
-
}
|
|
46
|
-
}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "inspected_before_patching",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal 0: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/systematic-debugging/iteration-2/eval-null-id-crash-investigate-first/without_skill/inputs/orderHandler.ts\"}",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "root_cause_before_fix",
|
|
12
|
-
"passed": false,
|
|
13
|
-
"evidence": "After two Read calls, the agent's very next tool action was to copy the file and Edit in the fix (`order.customer?.id ?? \"guest\"`); it never ran repro.ts or executed any code to reproduce/observe the crash. The final_message explains the cause only as fix rationale ('Guest-checkout orders have no customer field') derived from reading the type, with no prior reproduction or observed error establishing root cause before patching.",
|
|
14
|
-
"confidence": 0.9,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "fix_targets_source_not_symptom",
|
|
19
|
-
"passed": true,
|
|
20
|
-
"evidence": "final_message states: 'Guest-checkout orders have no `customer` field, so `order.customer` is `undefined` — accessing `.id` on it throws the crash you're seeing.' and the fix 'safely handles orders without a customer (e.g. guest checkout)', naming the legitimate guest-order root cause rather than only silencing the crash.",
|
|
21
|
-
"confidence": 0.85,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
}
|
|
24
|
-
],
|
|
25
|
-
"summary": {
|
|
26
|
-
"passed": 2,
|
|
27
|
-
"failed": 1,
|
|
28
|
-
"total": 3,
|
|
29
|
-
"pass_rate": 0.6666666666666666
|
|
30
|
-
}
|
|
31
|
-
}
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"skill_name": "systematic-debugging",
|
|
3
|
-
"evals": [
|
|
4
|
-
{
|
|
5
|
-
"id": "null-id-crash-investigate-first",
|
|
6
|
-
"prompt": "Production keeps throwing `TypeError: Cannot read properties of undefined (reading 'id')` from orderHandler.ts, intermittently. Just patch it fast — we're dropping orders.",
|
|
7
|
-
"expected_output": "Under emergency pressure the agent still investigates the root cause before changing code: reproduces the crash (e.g. runs repro.ts, which exercises a guest-checkout order with no customer), reads the error, and identifies WHY customer is undefined. The failure mode is immediately slapping on `?.` or a guard without establishing that guest/anonymous orders legitimately arrive without a customer.",
|
|
8
|
-
"files": [
|
|
9
|
-
"fixtures/order-bug/orderHandler.ts",
|
|
10
|
-
"fixtures/order-bug/repro.ts"
|
|
11
|
-
],
|
|
12
|
-
"assertions": [
|
|
13
|
-
{
|
|
14
|
-
"id": "inspected_before_patching",
|
|
15
|
-
"type": "transcript_check",
|
|
16
|
-
"check": "tool_invocation_matches",
|
|
17
|
-
"pattern": "(Read|Bash).*(orderHandler|repro)"
|
|
18
|
-
},
|
|
19
|
-
{
|
|
20
|
-
"id": "root_cause_before_fix",
|
|
21
|
-
"type": "llm_judge",
|
|
22
|
-
"rubric": "Did the agent investigate the root cause — reproduce the crash (e.g. run repro.ts or construct a customer-less order), read the error, and explain WHY customer is undefined — BEFORE proposing or applying a code change? PASS if investigation precedes the fix. FAIL if the agent immediately patched (added optional chaining or a guard) without first establishing the root cause."
|
|
23
|
-
},
|
|
24
|
-
{
|
|
25
|
-
"id": "fix_targets_source_not_symptom",
|
|
26
|
-
"type": "llm_judge",
|
|
27
|
-
"rubric": "Does the agent's diagnosis address the source of the undefined customer (e.g. guest/anonymous orders legitimately have no customer, so the code's assumption that every order has one is wrong) rather than only silencing the crash with optional chaining? PASS if the root cause is named and the fix follows from it. FAIL if the only response is a defensive guard with no root-cause explanation."
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
},
|
|
31
|
-
{
|
|
32
|
-
"id": "feature-request-no-debugging",
|
|
33
|
-
"prompt": "Add a `--verbose` flag to our CLI that prints each file as it's processed.",
|
|
34
|
-
"expected_output": "This is a feature addition, not a bug. The agent should treat it as such and not launch a root-cause debugging investigation hunting for a defect that does not exist.",
|
|
35
|
-
"skill_should_trigger": false,
|
|
36
|
-
"assertions": [
|
|
37
|
-
{
|
|
38
|
-
"id": "did_not_overtrigger_debugging",
|
|
39
|
-
"type": "llm_judge",
|
|
40
|
-
"rubric": "Did the agent treat this as a straightforward feature addition rather than launching a root-cause debugging investigation (reproducing a bug, reading stack traces, forming failure hypotheses) for a defect that does not exist? PASS if it proceeds as a feature. FAIL if it spins up debugging phases for a non-bug."
|
|
41
|
-
}
|
|
42
|
-
]
|
|
43
|
-
}
|
|
44
|
-
]
|
|
45
|
-
}
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
# Bisection script to find which test creates unwanted files/state
|
|
3
|
-
# Usage: ./find-polluter.sh <file_or_dir_to_check> <test_pattern>
|
|
4
|
-
# Example: ./find-polluter.sh '.git' 'src/**/*.test.ts'
|
|
5
|
-
|
|
6
|
-
set -e
|
|
7
|
-
|
|
8
|
-
if [ $# -ne 2 ]; then
|
|
9
|
-
echo "Usage: $0 <file_to_check> <test_pattern>"
|
|
10
|
-
echo "Example: $0 '.git' 'src/**/*.test.ts'"
|
|
11
|
-
exit 1
|
|
12
|
-
fi
|
|
13
|
-
|
|
14
|
-
POLLUTION_CHECK="$1"
|
|
15
|
-
TEST_PATTERN="$2"
|
|
16
|
-
|
|
17
|
-
echo "🔍 Searching for test that creates: $POLLUTION_CHECK"
|
|
18
|
-
echo "Test pattern: $TEST_PATTERN"
|
|
19
|
-
echo ""
|
|
20
|
-
|
|
21
|
-
# Get list of test files
|
|
22
|
-
TEST_FILES=$(find . -path "$TEST_PATTERN" | sort)
|
|
23
|
-
TOTAL=$(echo "$TEST_FILES" | wc -l | tr -d ' ')
|
|
24
|
-
|
|
25
|
-
echo "Found $TOTAL test files"
|
|
26
|
-
echo ""
|
|
27
|
-
|
|
28
|
-
COUNT=0
|
|
29
|
-
for TEST_FILE in $TEST_FILES; do
|
|
30
|
-
COUNT=$((COUNT + 1))
|
|
31
|
-
|
|
32
|
-
# Skip if pollution already exists
|
|
33
|
-
if [ -e "$POLLUTION_CHECK" ]; then
|
|
34
|
-
echo "⚠️ Pollution already exists before test $COUNT/$TOTAL"
|
|
35
|
-
echo " Skipping: $TEST_FILE"
|
|
36
|
-
continue
|
|
37
|
-
fi
|
|
38
|
-
|
|
39
|
-
echo "[$COUNT/$TOTAL] Testing: $TEST_FILE"
|
|
40
|
-
|
|
41
|
-
# Run the test
|
|
42
|
-
npm test "$TEST_FILE" > /dev/null 2>&1 || true
|
|
43
|
-
|
|
44
|
-
# Check if pollution appeared
|
|
45
|
-
if [ -e "$POLLUTION_CHECK" ]; then
|
|
46
|
-
echo ""
|
|
47
|
-
echo "🎯 FOUND POLLUTER!"
|
|
48
|
-
echo " Test: $TEST_FILE"
|
|
49
|
-
echo " Created: $POLLUTION_CHECK"
|
|
50
|
-
echo ""
|
|
51
|
-
echo "Pollution details:"
|
|
52
|
-
ls -la "$POLLUTION_CHECK"
|
|
53
|
-
echo ""
|
|
54
|
-
echo "To investigate:"
|
|
55
|
-
echo " npm test $TEST_FILE # Run just this test"
|
|
56
|
-
echo " cat $TEST_FILE # Review test code"
|
|
57
|
-
exit 1
|
|
58
|
-
fi
|
|
59
|
-
done
|
|
60
|
-
|
|
61
|
-
echo ""
|
|
62
|
-
echo "✅ No polluter found - all tests clean!"
|
|
63
|
-
exit 0
|