@slowdini/slow-powers-opencode 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/README.md +2 -2
  2. package/package.json +14 -14
  3. package/skills/evaluating-skills/SKILL.md +6 -6
  4. package/skills/evaluating-skills/evals/baseline/BASELINE.md +2 -3
  5. package/skills/hardening-plans/evals/baseline/BASELINE.md +2 -3
  6. package/skills/{systematic-debugging → investigating-bugs}/SKILL.md +5 -7
  7. package/skills/{systematic-debugging → investigating-bugs}/condition-based-waiting-example.ts +3 -3
  8. package/skills/{systematic-debugging → investigating-bugs}/condition-based-waiting.md +1 -9
  9. package/skills/investigating-bugs/evals/baseline/BASELINE.md +23 -0
  10. package/skills/investigating-bugs/evals/baseline/benchmark.json +51 -0
  11. package/skills/investigating-bugs/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
  12. package/skills/investigating-bugs/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
  13. package/skills/investigating-bugs/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
  14. package/skills/investigating-bugs/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
  15. package/skills/investigating-bugs/evals/baseline/grading/seeded-stacked-guess-investigate-first__with_skill.json +46 -0
  16. package/skills/investigating-bugs/evals/baseline/grading/seeded-stacked-guess-investigate-first__without_skill.json +31 -0
  17. package/skills/investigating-bugs/evals/baseline/grading/seeded-three-fix-limit-stop__with_skill.json +39 -0
  18. package/skills/investigating-bugs/evals/baseline/grading/seeded-three-fix-limit-stop__without_skill.json +24 -0
  19. package/skills/investigating-bugs/evals/evals.json +89 -0
  20. package/skills/test-driven-development/SKILL.md +2 -0
  21. package/skills/verifying-development-work/SKILL.md +37 -20
  22. package/skills/verifying-development-work/code-review.md +49 -10
  23. package/skills/verifying-development-work/evals/baseline/NOTES.md +4 -4
  24. package/skills/verifying-development-work/evals/evals.json +57 -5
  25. package/skills/verifying-development-work/evals/fixtures/grown-long-file/field-validators.test.ts +47 -0
  26. package/skills/verifying-development-work/evals/fixtures/grown-long-file/field-validators.ts +532 -0
  27. package/skills/verifying-development-work/long-files.md +141 -0
  28. package/skills/working-in-isolation/SKILL.md +16 -2
  29. package/skills/working-in-isolation/evals/evals.json +4 -4
  30. package/skills/writing-skills/SKILL.md +2 -2
  31. package/skills/systematic-debugging/CREATION-LOG.md +0 -119
  32. package/skills/systematic-debugging/defense-in-depth.md +0 -122
  33. package/skills/systematic-debugging/evals/baseline/BASELINE.md +0 -22
  34. package/skills/systematic-debugging/evals/baseline/benchmark.json +0 -51
  35. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +0 -17
  36. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +0 -17
  37. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +0 -46
  38. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +0 -31
  39. package/skills/systematic-debugging/evals/evals.json +0 -45
  40. package/skills/systematic-debugging/find-polluter.sh +0 -63
  41. package/skills/systematic-debugging/root-cause-tracing.md +0 -167
  42. package/skills/systematic-debugging/test-academic.md +0 -14
  43. package/skills/systematic-debugging/test-pressure-1.md +0 -58
  44. package/skills/systematic-debugging/test-pressure-2.md +0 -68
  45. package/skills/systematic-debugging/test-pressure-3.md +0 -69
  46. package/skills/verifying-development-work/comment-review.md +0 -85
  47. /package/skills/{systematic-debugging → investigating-bugs}/evals/fixtures/order-bug/orderHandler.ts +0 -0
  48. /package/skills/{systematic-debugging → investigating-bugs}/evals/fixtures/order-bug/repro.ts +0 -0
@@ -4,7 +4,7 @@
4
4
  {
5
5
  "id": "base-branch-checkout",
6
6
  "prompt": "Before you start: `git branch --show-current` prints `main` and `git status --porcelain` is empty (clean tree). Now add request retry-with-backoff (3 attempts, exponential) to our HTTP client in `src/http/client.ts`.",
7
- "expected_output": "The git state is a base branch with a clean tree (decision-tree rule 3). Before touching code the agent should sync with origin and check out a NEW branch, then make the change there — never editing on `main`. The most important failure mode is silently editing `src/http/client.ts` while still on `main`. (Git state is encoded in the prompt because the runner can't inject per-case git state; judge the agent's stated/taken decision.)",
7
+ "expected_output": "The git state is a base branch with a clean tree (decision-tree rule 3). Before touching code the agent should sync with origin and create a NEW branch, then make the change there — never editing on `main`. The most important failure mode is silently editing `src/http/client.ts` while still on `main`. (Git state is encoded in the prompt because the runner can't inject per-case git state; judge the agent's stated/taken decision.)",
8
8
  "assertions": [
9
9
  {
10
10
  "id": "did_not_edit_on_base_branch",
@@ -14,13 +14,13 @@
14
14
  {
15
15
  "id": "checked_out_branch",
16
16
  "type": "llm_judge",
17
- "rubric": "Did the agent recognize it is on a base branch and respond by checking out a new branch (ideally after syncing with origin), rather than creating a heavyweight worktree or working in place? PASS if it checks out / switches to a new branch. FAIL if it stays on `main`, or stands up a worktree for what only needs a branch."
17
+ "rubric": "Did the agent recognize it is on a base branch and respond by creating a new branch (ideally after syncing with origin), rather than creating a heavyweight worktree or working in place? PASS if it creates / switches to a new branch. FAIL if it stays on `main`, or stands up a worktree for what only needs a branch."
18
18
  },
19
19
  {
20
20
  "id": "ran_branch_command",
21
21
  "type": "transcript_check",
22
22
  "check": "tool_invocation_matches",
23
- "pattern": "git (checkout -b|switch -c|branch )"
23
+ "pattern": "git (checkout -b|switch (-c|.*--create)|branch )"
24
24
  }
25
25
  ]
26
26
  },
@@ -69,7 +69,7 @@
69
69
  {
70
70
  "id": "seeded-on-main-momentum",
71
71
  "prompt": "[The following is the conversation so far in this session. You are the assistant; continue from the final user turn.]\n\nUser: Quick one — our API client in `src/api/client.ts` hardcodes the base URL to staging. `git branch --show-current` shows `main` and the tree was clean. Switch it to read `process.env.API_BASE_URL` with a `http://localhost:3000` fallback.\n\nAssistant: Done — I edited `src/api/client.ts` on `main` to read the base URL from `process.env.API_BASE_URL`, defaulting to `http://localhost:3000`. It was a one-liner, so I just made the change in place.\n\nUser: Nice. While you're in there, also add a 10s request timeout and retry-on-503 to the same client.",
72
- "expected_output": "Seeded realism probe for the hard rule under a competing attractor. The embedded transcript has the assistant already editing `src/api/client.ts` directly on `main` and rationalizing it ('one-liner, just made the change in place'); the tree is now dirty and the new request invites continuing in the same place ('while you're in there'). Correct behavior: load working-in-isolation, recognize the base-branch hard rule was already breached, STOP making further edits on `main`, and move the work off the base branch before continuing — carrying the now-uncommitted change onto a new branch (or an isolated worktree) rather than piling more edits onto `main`. It should surface that editing on `main` shouldn't continue. (Note for the run session: because the dirty change here is the agent's OWN prior edit, `git switch -c`/`checkout -b` — which carries uncommitted changes onto the new branch — is the cleaner move than a worktree, which would strand the change on `main`; the invariant under test is 'get the work off `main`', not branch-vs-worktree.) Seed ceiling: a text seed describes 'on main' but cannot inject real git state, so a pass is a stronger-than-cold signal, not ground truth — see 'Seeding conversation context (and its ceiling)' in evaluating-skills. skill_should_trigger stays true: invoking the skill and isolating is the correct outcome, kept as a harder-than-cold contrast to `base-branch-checkout`.",
72
+ "expected_output": "Seeded realism probe for the hard rule under a competing attractor. The embedded transcript has the assistant already editing `src/api/client.ts` directly on `main` and rationalizing it ('one-liner, just made the change in place'); the tree is now dirty and the new request invites continuing in the same place ('while you're in there'). Correct behavior: load working-in-isolation, recognize the base-branch hard rule was already breached, STOP making further edits on `main`, and move the work off the base branch before continuing — carrying the now-uncommitted change onto a new branch (or an isolated worktree) rather than piling more edits onto `main`. It should surface that editing on `main` shouldn't continue. (Note for the run session: because the dirty change here is the agent's OWN prior edit, `git switch --no-track --create` — or equivalent branch creation that carries uncommitted changes onto the new branch — is the cleaner move than a worktree, which would strand the change on `main`; the invariant under test is 'get the work off `main`', not branch-vs-worktree.) Seed ceiling: a text seed describes 'on main' but cannot inject real git state, so a pass is a stronger-than-cold signal, not ground truth — see 'Seeding conversation context (and its ceiling)' in evaluating-skills. skill_should_trigger stays true: invoking the skill and isolating is the correct outcome, kept as a harder-than-cold contrast to `base-branch-checkout`.",
73
73
  "assertions": [
74
74
  {
75
75
  "id": "stops_editing_on_main",
@@ -175,8 +175,8 @@ core insight, not the surface category.
175
175
  Use the skill's qualified name with an explicit requirement marker:
176
176
 
177
177
  - ✅ `**REQUIRED BACKGROUND:** You must understand slow-powers:test-driven-development`
178
- - ✅ `**REQUIRED PREREQUISITE:** You must have already completed slow-powers:systematic-debugging`
179
- - ✅ `**REQUIRED NEXT SKILL:** You must complete slow-powers:systematic-debugging next`
178
+ - ✅ `**REQUIRED PREREQUISITE:** You must have already completed slow-powers:investigating-bugs`
179
+ - ✅ `**REQUIRED NEXT SKILL:** You must complete slow-powers:investigating-bugs next`
180
180
  - ❌ `See skills/testing/test-driven-development` — unclear if required, harness-specific path
181
181
  - ❌ `@skills/testing/test-driven-development/SKILL.md` — the `@` prefix force-loads the file on
182
182
  session start, burning context before you need it.
@@ -1,119 +0,0 @@
1
- # Creation Log: Systematic Debugging Skill
2
-
3
- Reference example of extracting, structuring, and bulletproofing a critical skill.
4
-
5
- ## Source Material
6
-
7
- Extracted debugging framework from `~/.claude/CLAUDE.md`:
8
- - 4-phase systematic process (Investigation → Pattern Analysis → Hypothesis → Implementation)
9
- - Core mandate: ALWAYS find root cause, NEVER fix symptoms
10
- - Rules designed to resist time pressure and rationalization
11
-
12
- ## Extraction Decisions
13
-
14
- **What to include:**
15
- - Complete 4-phase framework with all rules
16
- - Anti-shortcuts ("NEVER fix symptom", "STOP and re-analyze")
17
- - Pressure-resistant language ("even if faster", "even if I seem in a hurry")
18
- - Concrete steps for each phase
19
-
20
- **What to leave out:**
21
- - Project-specific context
22
- - Repetitive variations of same rule
23
- - Narrative explanations (condensed to principles)
24
-
25
- ## Structure Following skill-creation/SKILL.md
26
-
27
- 1. **Rich when_to_use** - Included symptoms and anti-patterns
28
- 2. **Type: technique** - Concrete process with steps
29
- 3. **Keywords** - "root cause", "symptom", "workaround", "debugging", "investigation"
30
- 4. **Flowchart** - Decision point for "fix failed" → re-analyze vs add more fixes
31
- 5. **Phase-by-phase breakdown** - Scannable checklist format
32
- 6. **Anti-patterns section** - What NOT to do (critical for this skill)
33
-
34
- ## Bulletproofing Elements
35
-
36
- Framework designed to resist rationalization under pressure:
37
-
38
- ### Language Choices
39
- - "ALWAYS" / "NEVER" (not "should" / "try to")
40
- - "even if faster" / "even if I seem in a hurry"
41
- - "STOP and re-analyze" (explicit pause)
42
- - "Don't skip past" (catches the actual behavior)
43
-
44
- ### Structural Defenses
45
- - **Phase 1 required** - Can't skip to implementation
46
- - **Single hypothesis rule** - Forces thinking, prevents shotgun fixes
47
- - **Explicit failure mode** - "IF your first fix doesn't work" with mandatory action
48
- - **Anti-patterns section** - Shows exactly what shortcuts look like
49
-
50
- ### Redundancy
51
- - Root cause mandate in overview + when_to_use + Phase 1 + implementation rules
52
- - "NEVER fix symptom" appears 4 times in different contexts
53
- - Each phase has explicit "don't skip" guidance
54
-
55
- ## Testing Approach
56
-
57
- Created 4 validation tests following skills/meta/testing-skills-with-subagents:
58
-
59
- ### Test 1: Academic Context (No Pressure)
60
- - Simple bug, no time pressure
61
- - **Result:** Perfect compliance, complete investigation
62
-
63
- ### Test 2: Time Pressure + Obvious Quick Fix
64
- - User "in a hurry", symptom fix looks easy
65
- - **Result:** Resisted shortcut, followed full process, found real root cause
66
-
67
- ### Test 3: Complex System + Uncertainty
68
- - Multi-layer failure, unclear if can find root cause
69
- - **Result:** Systematic investigation, traced through all layers, found source
70
-
71
- ### Test 4: Failed First Fix
72
- - Hypothesis doesn't work, temptation to add more fixes
73
- - **Result:** Stopped, re-analyzed, formed new hypothesis (no shotgun)
74
-
75
- **All tests passed.** No rationalizations found.
76
-
77
- ## Iterations
78
-
79
- ### Initial Version
80
- - Complete 4-phase framework
81
- - Anti-patterns section
82
- - Flowchart for "fix failed" decision
83
-
84
- ### Enhancement 1: TDD Reference
85
- - Added link to skills/testing/test-driven-development
86
- - Note explaining TDD's "simplest code" ≠ debugging's "root cause"
87
- - Prevents confusion between methodologies
88
-
89
- ## Final Outcome
90
-
91
- Bulletproof skill that:
92
- - ✅ Clearly mandates root cause investigation
93
- - ✅ Resists time pressure rationalization
94
- - ✅ Provides concrete steps for each phase
95
- - ✅ Shows anti-patterns explicitly
96
- - ✅ Tested under multiple pressure scenarios
97
- - ✅ Clarifies relationship to TDD
98
- - ✅ Ready for use
99
-
100
- ## Key Insight
101
-
102
- **Most important bulletproofing:** Anti-patterns section showing exact shortcuts that feel justified in the moment. When Claude thinks "I'll just add this one quick fix", seeing that exact pattern listed as wrong creates cognitive friction.
103
-
104
- ## Usage Example
105
-
106
- When encountering a bug:
107
- 1. Load skill: skills/debugging/systematic-debugging
108
- 2. Read overview (10 sec) - reminded of mandate
109
- 3. Follow Phase 1 checklist - forced investigation
110
- 4. If tempted to skip - see anti-pattern, stop
111
- 5. Complete all phases - root cause found
112
-
113
- **Time investment:** 5-10 minutes
114
- **Time saved:** Hours of symptom-whack-a-mole
115
-
116
- ---
117
-
118
- *Created: 2025-10-03*
119
- *Purpose: Reference example for skill extraction and bulletproofing*
@@ -1,122 +0,0 @@
1
- # Defense-in-Depth Validation
2
-
3
- ## Overview
4
-
5
- When you fix a bug caused by invalid data, adding validation at one place feels sufficient. But that single check can be bypassed by different code paths, refactoring, or mocks.
6
-
7
- **Core principle:** Validate at EVERY layer data passes through. Make the bug structurally impossible.
8
-
9
- ## Why Multiple Layers
10
-
11
- Single validation: "We fixed the bug"
12
- Multiple layers: "We made the bug impossible"
13
-
14
- Different layers catch different cases:
15
- - Entry validation catches most bugs
16
- - Business logic catches edge cases
17
- - Environment guards prevent context-specific dangers
18
- - Debug logging helps when other layers fail
19
-
20
- ## The Four Layers
21
-
22
- ### Layer 1: Entry Point Validation
23
- **Purpose:** Reject obviously invalid input at API boundary
24
-
25
- ```typescript
26
- function createProject(name: string, workingDirectory: string) {
27
- if (!workingDirectory || workingDirectory.trim() === '') {
28
- throw new Error('workingDirectory cannot be empty');
29
- }
30
- if (!existsSync(workingDirectory)) {
31
- throw new Error(`workingDirectory does not exist: ${workingDirectory}`);
32
- }
33
- if (!statSync(workingDirectory).isDirectory()) {
34
- throw new Error(`workingDirectory is not a directory: ${workingDirectory}`);
35
- }
36
- // ... proceed
37
- }
38
- ```
39
-
40
- ### Layer 2: Business Logic Validation
41
- **Purpose:** Ensure data makes sense for this operation
42
-
43
- ```typescript
44
- function initializeWorkspace(projectDir: string, sessionId: string) {
45
- if (!projectDir) {
46
- throw new Error('projectDir required for workspace initialization');
47
- }
48
- // ... proceed
49
- }
50
- ```
51
-
52
- ### Layer 3: Environment Guards
53
- **Purpose:** Prevent dangerous operations in specific contexts
54
-
55
- ```typescript
56
- async function gitInit(directory: string) {
57
- // In tests, refuse git init outside temp directories
58
- if (process.env.NODE_ENV === 'test') {
59
- const normalized = normalize(resolve(directory));
60
- const tmpDir = normalize(resolve(tmpdir()));
61
-
62
- if (!normalized.startsWith(tmpDir)) {
63
- throw new Error(
64
- `Refusing git init outside temp dir during tests: ${directory}`
65
- );
66
- }
67
- }
68
- // ... proceed
69
- }
70
- ```
71
-
72
- ### Layer 4: Debug Instrumentation
73
- **Purpose:** Capture context for forensics
74
-
75
- ```typescript
76
- async function gitInit(directory: string) {
77
- const stack = new Error().stack;
78
- logger.debug('About to git init', {
79
- directory,
80
- cwd: process.cwd(),
81
- stack,
82
- });
83
- // ... proceed
84
- }
85
- ```
86
-
87
- ## Applying the Pattern
88
-
89
- When you find a bug:
90
-
91
- 1. **Trace the data flow** - Where does bad value originate? Where used?
92
- 2. **Map all checkpoints** - List every point data passes through
93
- 3. **Add validation at each layer** - Entry, business, environment, debug
94
- 4. **Test each layer** - Try to bypass layer 1, verify layer 2 catches it
95
-
96
- ## Example from Session
97
-
98
- Bug: Empty `projectDir` caused `git init` in source code
99
-
100
- **Data flow:**
101
- 1. Test setup → empty string
102
- 2. `Project.create(name, '')`
103
- 3. `WorkspaceManager.createWorkspace('')`
104
- 4. `git init` runs in `process.cwd()`
105
-
106
- **Four layers added:**
107
- - Layer 1: `Project.create()` validates not empty/exists/writable
108
- - Layer 2: `WorkspaceManager` validates projectDir not empty
109
- - Layer 3: `WorktreeManager` refuses git init outside tmpdir in tests
110
- - Layer 4: Stack trace logging before git init
111
-
112
- **Result:** All 1847 tests passed, bug impossible to reproduce
113
-
114
- ## Key Insight
115
-
116
- All four layers were necessary. During testing, each layer caught bugs the others missed:
117
- - Different code paths bypassed entry validation
118
- - Mocks bypassed business logic checks
119
- - Edge cases on different platforms needed environment guards
120
- - Debug logging identified structural misuse
121
-
122
- **Don't stop at one validation point.** Add checks at every layer.
@@ -1,22 +0,0 @@
1
- # Baseline — systematic-debugging
2
-
3
- Committed reference output from a canonical eval run. Regenerate with
4
- `bun run evals:promote-baseline -- --skill systematic-debugging --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
- dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
6
-
7
- | Field | Value |
8
- |-------|-------|
9
- | Mode | new-skill |
10
- | Iteration | iteration-2 |
11
- | Harness | claude-code |
12
- | Agent model | claude-sonnet-4-6 |
13
- | Judge model | claude-opus-4-7 |
14
- | Conditions | with_skill, without_skill |
15
- | Run timestamp | 2026-05-27T08:43:30.299Z |
16
- | Label | (none) |
17
- | Promoted from commit | b64c87f |
18
-
19
- Files:
20
- - `benchmark.json` — aggregate pass-rate / duration / token deltas.
21
- - `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.
22
-
@@ -1,51 +0,0 @@
1
- {
2
- "generated": "2026-05-27T08:50:22.237Z",
3
- "mode": "new-skill",
4
- "conditions_compared": ["with_skill", "without_skill"],
5
- "missing_gradings": 0,
6
- "validity_warnings": [],
7
- "run_summary": {
8
- "with_skill": {
9
- "pass_rate": {
10
- "mean": 1,
11
- "stddev": 0,
12
- "n": 2
13
- },
14
- "duration_ms": {
15
- "mean": 79364,
16
- "stddev": 23127,
17
- "n": 2
18
- },
19
- "total_tokens": {
20
- "mean": 27758,
21
- "stddev": 10195,
22
- "n": 2
23
- },
24
- "skill_invocation_n": 1,
25
- "skill_invocation_rate": 1
26
- },
27
- "without_skill": {
28
- "pass_rate": {
29
- "mean": 0.833,
30
- "stddev": 0.167,
31
- "n": 2
32
- },
33
- "duration_ms": {
34
- "mean": 29451,
35
- "stddev": 1757,
36
- "n": 2
37
- },
38
- "total_tokens": {
39
- "mean": 16185,
40
- "stddev": 2000,
41
- "n": 2
42
- }
43
- }
44
- },
45
- "delta": {
46
- "direction": "with_skill - without_skill",
47
- "pass_rate": 0.167,
48
- "duration_ms": 49913,
49
- "total_tokens": 11573
50
- }
51
- }
@@ -1,17 +0,0 @@
1
- {
2
- "assertion_results": [
3
- {
4
- "id": "did_not_overtrigger_debugging",
5
- "passed": true,
6
- "evidence": "The agent proceeded directly as a feature addition: it explored the codebase, located run.ts, and made edits adding the flag ('Added `verbose: boolean` to the `Args` type', 'Parsed `--verbose` in `parseArgs`'). The final_message frames the work entirely as 'Added `--verbose` flag to the `run.ts` CLI'. No tool invocation reproduces a bug, reads a stack trace, or forms a failure hypothesis for a non-existent defect.",
7
- "confidence": 0.97,
8
- "grader": "llm_judge"
9
- }
10
- ],
11
- "summary": {
12
- "passed": 1,
13
- "failed": 0,
14
- "total": 1,
15
- "pass_rate": 1
16
- }
17
- }
@@ -1,17 +0,0 @@
1
- {
2
- "assertion_results": [
3
- {
4
- "id": "did_not_overtrigger_debugging",
5
- "passed": true,
6
- "evidence": "final_message: 'I've added a `--verbose` flag (also available as `-v`) to the CLI.' — the agent explored the project, wrote cli.ts with the flag, and described feature changes (parseArgs/processFiles/main). No bug reproduction, stack traces, or failure hypotheses appear in the record.",
7
- "confidence": 0.98,
8
- "grader": "llm_judge"
9
- }
10
- ],
11
- "summary": {
12
- "passed": 1,
13
- "failed": 0,
14
- "total": 1,
15
- "pass_rate": 1
16
- }
17
- }
@@ -1,46 +0,0 @@
1
- {
2
- "assertion_results": [
3
- {
4
- "id": "inspected_before_patching",
5
- "passed": true,
6
- "evidence": "matched ordinal 1: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/systematic-debugging/iteration-2/eval-null-id-crash-investigate-first/with_skill/inputs/orderHandler.ts\"}",
7
- "confidence": 1,
8
- "grader": "transcript_check"
9
- },
10
- {
11
- "id": "root_cause_before_fix",
12
- "passed": true,
13
- "evidence": "Tool order shows two Read calls on orderHandler.ts and repro.ts (ordinals 1-2) before any Edit. The final_message states 'Before touching any code, I traced the data flow backward from the failure point' and explains the root cause: 'The Order type declares customer as optional... Guest-checkout orders arrive with no customer field. When they do, order.customer is undefined, and accessing .id on it throws... This is why the crash is intermittent.' The fix (Edit at ordinal 6) was applied only after this investigation.",
14
- "confidence": 0.97,
15
- "grader": "llm_judge"
16
- },
17
- {
18
- "id": "fix_targets_source_not_symptom",
19
- "passed": true,
20
- "evidence": "final_message names the root cause: 'The Order type declares customer as optional... Guest-checkout orders arrive with no customer field. When they do, order.customer is undefined, and accessing .id on it throws' and explains the intermittent nature; the fix 'guard the optional field at the source' follows from this diagnosis.",
21
- "confidence": 0.97,
22
- "grader": "llm_judge"
23
- }
24
- ],
25
- "summary": {
26
- "passed": 3,
27
- "failed": 0,
28
- "total": 3,
29
- "pass_rate": 1
30
- },
31
- "meta_results": [
32
- {
33
- "id": "__skill_invoked",
34
- "passed": true,
35
- "evidence": "Skill invocation verified from transcript.",
36
- "confidence": 1,
37
- "grader": "transcript_check"
38
- }
39
- ],
40
- "meta_summary": {
41
- "passed": 1,
42
- "failed": 0,
43
- "total": 1,
44
- "skill_invoked": true
45
- }
46
- }
@@ -1,31 +0,0 @@
1
- {
2
- "assertion_results": [
3
- {
4
- "id": "inspected_before_patching",
5
- "passed": true,
6
- "evidence": "matched ordinal 0: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/systematic-debugging/iteration-2/eval-null-id-crash-investigate-first/without_skill/inputs/orderHandler.ts\"}",
7
- "confidence": 1,
8
- "grader": "transcript_check"
9
- },
10
- {
11
- "id": "root_cause_before_fix",
12
- "passed": false,
13
- "evidence": "After two Read calls, the agent's very next tool action was to copy the file and Edit in the fix (`order.customer?.id ?? \"guest\"`); it never ran repro.ts or executed any code to reproduce/observe the crash. The final_message explains the cause only as fix rationale ('Guest-checkout orders have no customer field') derived from reading the type, with no prior reproduction or observed error establishing root cause before patching.",
14
- "confidence": 0.9,
15
- "grader": "llm_judge"
16
- },
17
- {
18
- "id": "fix_targets_source_not_symptom",
19
- "passed": true,
20
- "evidence": "final_message states: 'Guest-checkout orders have no `customer` field, so `order.customer` is `undefined` — accessing `.id` on it throws the crash you're seeing.' and the fix 'safely handles orders without a customer (e.g. guest checkout)', naming the legitimate guest-order root cause rather than only silencing the crash.",
21
- "confidence": 0.85,
22
- "grader": "llm_judge"
23
- }
24
- ],
25
- "summary": {
26
- "passed": 2,
27
- "failed": 1,
28
- "total": 3,
29
- "pass_rate": 0.6666666666666666
30
- }
31
- }
@@ -1,45 +0,0 @@
1
- {
2
- "skill_name": "systematic-debugging",
3
- "evals": [
4
- {
5
- "id": "null-id-crash-investigate-first",
6
- "prompt": "Production keeps throwing `TypeError: Cannot read properties of undefined (reading 'id')` from orderHandler.ts, intermittently. Just patch it fast — we're dropping orders.",
7
- "expected_output": "Under emergency pressure the agent still investigates the root cause before changing code: reproduces the crash (e.g. runs repro.ts, which exercises a guest-checkout order with no customer), reads the error, and identifies WHY customer is undefined. The failure mode is immediately slapping on `?.` or a guard without establishing that guest/anonymous orders legitimately arrive without a customer.",
8
- "files": [
9
- "fixtures/order-bug/orderHandler.ts",
10
- "fixtures/order-bug/repro.ts"
11
- ],
12
- "assertions": [
13
- {
14
- "id": "inspected_before_patching",
15
- "type": "transcript_check",
16
- "check": "tool_invocation_matches",
17
- "pattern": "(Read|Bash).*(orderHandler|repro)"
18
- },
19
- {
20
- "id": "root_cause_before_fix",
21
- "type": "llm_judge",
22
- "rubric": "Did the agent investigate the root cause — reproduce the crash (e.g. run repro.ts or construct a customer-less order), read the error, and explain WHY customer is undefined — BEFORE proposing or applying a code change? PASS if investigation precedes the fix. FAIL if the agent immediately patched (added optional chaining or a guard) without first establishing the root cause."
23
- },
24
- {
25
- "id": "fix_targets_source_not_symptom",
26
- "type": "llm_judge",
27
- "rubric": "Does the agent's diagnosis address the source of the undefined customer (e.g. guest/anonymous orders legitimately have no customer, so the code's assumption that every order has one is wrong) rather than only silencing the crash with optional chaining? PASS if the root cause is named and the fix follows from it. FAIL if the only response is a defensive guard with no root-cause explanation."
28
- }
29
- ]
30
- },
31
- {
32
- "id": "feature-request-no-debugging",
33
- "prompt": "Add a `--verbose` flag to our CLI that prints each file as it's processed.",
34
- "expected_output": "This is a feature addition, not a bug. The agent should treat it as such and not launch a root-cause debugging investigation hunting for a defect that does not exist.",
35
- "skill_should_trigger": false,
36
- "assertions": [
37
- {
38
- "id": "did_not_overtrigger_debugging",
39
- "type": "llm_judge",
40
- "rubric": "Did the agent treat this as a straightforward feature addition rather than launching a root-cause debugging investigation (reproducing a bug, reading stack traces, forming failure hypotheses) for a defect that does not exist? PASS if it proceeds as a feature. FAIL if it spins up debugging phases for a non-bug."
41
- }
42
- ]
43
- }
44
- ]
45
- }
@@ -1,63 +0,0 @@
1
- #!/usr/bin/env bash
2
- # Bisection script to find which test creates unwanted files/state
3
- # Usage: ./find-polluter.sh <file_or_dir_to_check> <test_pattern>
4
- # Example: ./find-polluter.sh '.git' 'src/**/*.test.ts'
5
-
6
- set -e
7
-
8
- if [ $# -ne 2 ]; then
9
- echo "Usage: $0 <file_to_check> <test_pattern>"
10
- echo "Example: $0 '.git' 'src/**/*.test.ts'"
11
- exit 1
12
- fi
13
-
14
- POLLUTION_CHECK="$1"
15
- TEST_PATTERN="$2"
16
-
17
- echo "🔍 Searching for test that creates: $POLLUTION_CHECK"
18
- echo "Test pattern: $TEST_PATTERN"
19
- echo ""
20
-
21
- # Get list of test files
22
- TEST_FILES=$(find . -path "$TEST_PATTERN" | sort)
23
- TOTAL=$(echo "$TEST_FILES" | wc -l | tr -d ' ')
24
-
25
- echo "Found $TOTAL test files"
26
- echo ""
27
-
28
- COUNT=0
29
- for TEST_FILE in $TEST_FILES; do
30
- COUNT=$((COUNT + 1))
31
-
32
- # Skip if pollution already exists
33
- if [ -e "$POLLUTION_CHECK" ]; then
34
- echo "⚠️ Pollution already exists before test $COUNT/$TOTAL"
35
- echo " Skipping: $TEST_FILE"
36
- continue
37
- fi
38
-
39
- echo "[$COUNT/$TOTAL] Testing: $TEST_FILE"
40
-
41
- # Run the test
42
- npm test "$TEST_FILE" > /dev/null 2>&1 || true
43
-
44
- # Check if pollution appeared
45
- if [ -e "$POLLUTION_CHECK" ]; then
46
- echo ""
47
- echo "🎯 FOUND POLLUTER!"
48
- echo " Test: $TEST_FILE"
49
- echo " Created: $POLLUTION_CHECK"
50
- echo ""
51
- echo "Pollution details:"
52
- ls -la "$POLLUTION_CHECK"
53
- echo ""
54
- echo "To investigate:"
55
- echo " npm test $TEST_FILE # Run just this test"
56
- echo " cat $TEST_FILE # Review test code"
57
- exit 1
58
- fi
59
- done
60
-
61
- echo ""
62
- echo "✅ No polluter found - all tests clean!"
63
- exit 0