@slowdini/slow-powers-opencode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +174 -0
  3. package/bootstrap.md +16 -0
  4. package/opencode/plugins/slow-powers.js +86 -0
  5. package/package.json +66 -0
  6. package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
  7. package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
  8. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
  9. package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
  10. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
  11. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
  12. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
  13. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
  14. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
  15. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
  16. package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
  17. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
  18. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
  19. package/skills/evaluating-skills/SKILL.md +448 -0
  20. package/skills/evaluating-skills/evals/evals.json +52 -0
  21. package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
  22. package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
  23. package/skills/evaluating-skills/harness-details/claude.md +135 -0
  24. package/skills/evaluating-skills/pressure-scenarios.md +163 -0
  25. package/skills/evaluating-skills/runner/README.md +140 -0
  26. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
  27. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
  28. package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
  29. package/skills/evaluating-skills/runner/aggregate.ts +228 -0
  30. package/skills/evaluating-skills/runner/context.test.ts +181 -0
  31. package/skills/evaluating-skills/runner/context.ts +90 -0
  32. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
  33. package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
  34. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
  35. package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
  36. package/skills/evaluating-skills/runner/grade.test.ts +347 -0
  37. package/skills/evaluating-skills/runner/grade.ts +603 -0
  38. package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
  39. package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
  40. package/skills/evaluating-skills/runner/guard/install.ts +147 -0
  41. package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
  42. package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
  43. package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
  44. package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
  45. package/skills/evaluating-skills/runner/run.test.ts +716 -0
  46. package/skills/evaluating-skills/runner/run.ts +814 -0
  47. package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
  48. package/skills/evaluating-skills/runner/types.ts +104 -0
  49. package/skills/evaluating-skills/runner/validate-all.ts +54 -0
  50. package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
  51. package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
  52. package/skills/evaluating-skills/runner/validate.test.ts +56 -0
  53. package/skills/evaluating-skills/runner/validate.ts +21 -0
  54. package/skills/evaluating-skills/schema/evals.schema.json +105 -0
  55. package/skills/evaluating-skills/schema/grading.schema.json +84 -0
  56. package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
  57. package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
  58. package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
  59. package/skills/evaluating-skills/templates/evals.json.example +17 -0
  60. package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
  61. package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
  62. package/skills/finishing-a-development-branch/SKILL.md +96 -0
  63. package/skills/finishing-a-development-branch/evals/evals.json +41 -0
  64. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
  65. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
  66. package/skills/hardening-plans/SKILL.md +72 -0
  67. package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
  68. package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
  69. package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
  70. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
  71. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
  72. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
  73. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
  74. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
  75. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
  76. package/skills/hardening-plans/evals/evals.json +114 -0
  77. package/skills/systematic-debugging/CREATION-LOG.md +119 -0
  78. package/skills/systematic-debugging/SKILL.md +84 -0
  79. package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
  80. package/skills/systematic-debugging/condition-based-waiting.md +115 -0
  81. package/skills/systematic-debugging/defense-in-depth.md +122 -0
  82. package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
  83. package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
  84. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
  85. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
  86. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
  87. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
  88. package/skills/systematic-debugging/evals/evals.json +45 -0
  89. package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
  90. package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
  91. package/skills/systematic-debugging/find-polluter.sh +63 -0
  92. package/skills/systematic-debugging/root-cause-tracing.md +169 -0
  93. package/skills/systematic-debugging/test-academic.md +14 -0
  94. package/skills/systematic-debugging/test-pressure-1.md +58 -0
  95. package/skills/systematic-debugging/test-pressure-2.md +68 -0
  96. package/skills/systematic-debugging/test-pressure-3.md +69 -0
  97. package/skills/test-driven-development/SKILL.md +93 -0
  98. package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
  99. package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
  100. package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
  101. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
  102. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
  103. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
  104. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
  105. package/skills/test-driven-development/evals/evals.json +77 -0
  106. package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
  107. package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
  108. package/skills/test-driven-development/testing-anti-patterns.md +299 -0
  109. package/skills/using-git-worktrees/SKILL.md +70 -0
  110. package/skills/using-git-worktrees/evals/evals.json +40 -0
  111. package/skills/verification-before-completion/SKILL.md +65 -0
  112. package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
  113. package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
  114. package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
  115. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  116. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  117. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
  118. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  119. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  120. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  121. package/skills/verification-before-completion/evals/evals.json +77 -0
  122. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
  123. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
  124. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
  125. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
  126. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
  127. package/skills/writing-skills/SKILL.md +306 -0
  128. package/skills/writing-skills/evals/evals.json +40 -0
  129. package/skills/writing-skills/graphviz-conventions.dot +172 -0
  130. package/skills/writing-skills/persuasion-principles.md +187 -0
  131. package/skills/writing-skills/scripts/render-graphs.js +181 -0
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env bash
2
+ # Bisection script to find which test creates unwanted files/state
3
+ # Usage: ./find-polluter.sh <file_or_dir_to_check> <test_pattern>
4
+ # Example: ./find-polluter.sh '.git' 'src/**/*.test.ts'
5
+
6
+ set -e
7
+
8
+ if [ $# -ne 2 ]; then
9
+ echo "Usage: $0 <file_to_check> <test_pattern>"
10
+ echo "Example: $0 '.git' 'src/**/*.test.ts'"
11
+ exit 1
12
+ fi
13
+
14
+ POLLUTION_CHECK="$1"
15
+ TEST_PATTERN="$2"
16
+
17
+ echo "🔍 Searching for test that creates: $POLLUTION_CHECK"
18
+ echo "Test pattern: $TEST_PATTERN"
19
+ echo ""
20
+
21
+ # Get list of test files (find prints paths as "./..."; anchor the pattern so 'src/**/*.test.ts' matches)
22
+ TEST_FILES=$(find . -path "./${TEST_PATTERN#./}" | sort)
23
+ TOTAL=$(printf '%s\n' "$TEST_FILES" | grep -c . || true)  # count non-empty lines so zero matches reports 0, not 1
24
+
25
+ echo "Found $TOTAL test files"
26
+ echo ""
27
+
28
+ COUNT=0
29
+ for TEST_FILE in $TEST_FILES; do
30
+ COUNT=$((COUNT + 1))
31
+
32
+ # Skip if pollution already exists
33
+ if [ -e "$POLLUTION_CHECK" ]; then
34
+ echo "⚠️ Pollution already exists before test $COUNT/$TOTAL"
35
+ echo " Skipping: $TEST_FILE"
36
+ continue
37
+ fi
38
+
39
+ echo "[$COUNT/$TOTAL] Testing: $TEST_FILE"
40
+
41
+ # Run the test
42
+ npm test "$TEST_FILE" > /dev/null 2>&1 || true
43
+
44
+ # Check if pollution appeared
45
+ if [ -e "$POLLUTION_CHECK" ]; then
46
+ echo ""
47
+ echo "🎯 FOUND POLLUTER!"
48
+ echo " Test: $TEST_FILE"
49
+ echo " Created: $POLLUTION_CHECK"
50
+ echo ""
51
+ echo "Pollution details:"
52
+ ls -la "$POLLUTION_CHECK"
53
+ echo ""
54
+ echo "To investigate:"
55
+ echo " npm test $TEST_FILE # Run just this test"
56
+ echo " cat $TEST_FILE # Review test code"
57
+ exit 1
58
+ fi
59
+ done
60
+
61
+ echo ""
62
+ echo "✅ No polluter found - all tests clean!"
63
+ exit 0
@@ -0,0 +1,169 @@
1
+ # Root Cause Tracing
2
+
3
+ ## Overview
4
+
5
+ Bugs often manifest deep in the call stack (git init in wrong directory, file created in wrong location, database opened with wrong path). Your instinct is to fix where the error appears, but that's treating a symptom.
6
+
7
+ **Core principle:** Trace backward through the call chain until you find the original trigger, then fix at the source.
8
+
9
+ ## When to Use
10
+
11
+ ```dot
12
+ digraph when_to_use {
13
+ "Bug appears deep in stack?" [shape=diamond];
14
+ "Can trace backwards?" [shape=diamond];
15
+ "Fix at symptom point" [shape=box];
16
+ "Trace to original trigger" [shape=box];
17
+ "BETTER: Also add defense-in-depth" [shape=box];
18
+
19
+ "Bug appears deep in stack?" -> "Can trace backwards?" [label="yes"];
20
+ "Can trace backwards?" -> "Trace to original trigger" [label="yes"];
21
+ "Can trace backwards?" -> "Fix at symptom point" [label="no - dead end"];
22
+ "Trace to original trigger" -> "BETTER: Also add defense-in-depth";
23
+ }
24
+ ```
25
+
26
+ **Use when:**
27
+ - Error happens deep in execution (not at entry point)
28
+ - Stack trace shows long call chain
29
+ - Unclear where invalid data originated
30
+ - Need to find which test/code triggers the problem
31
+
32
+ ## The Tracing Process
33
+
34
+ ### 1. Observe the Symptom
35
+ ```
36
+ Error: git init failed in ~/project/packages/core
37
+ ```
38
+
39
+ ### 2. Find Immediate Cause
40
+ **What code directly causes this?**
41
+ ```typescript
42
+ await execFileAsync('git', ['init'], { cwd: projectDir });
43
+ ```
44
+
45
+ ### 3. Ask: What Called This?
46
+ ```typescript
47
+ WorktreeManager.createSessionWorktree(projectDir, sessionId)
48
+ → called by Session.initializeWorkspace()
49
+ → called by Session.create()
50
+ → called by test at Project.create()
51
+ ```
52
+
53
+ ### 4. Keep Tracing Up
54
+ **What value was passed?**
55
+ - `projectDir = ''` (empty string!)
56
+ - Empty string as `cwd` resolves to `process.cwd()`
57
+ - That's the source code directory!
58
+
59
+ ### 5. Find Original Trigger
60
+ **Where did empty string come from?**
61
+ ```typescript
62
+ const context = setupCoreTest(); // Returns { tempDir: '' }
63
+ Project.create('name', context.tempDir); // Accessed before beforeEach!
64
+ ```
65
+
66
+ ## Adding Stack Traces
67
+
68
+ When you can't trace manually, add instrumentation:
69
+
70
+ ```typescript
71
+ // Before the problematic operation
72
+ async function gitInit(directory: string) {
73
+ const stack = new Error().stack;
74
+ console.error('DEBUG git init:', {
75
+ directory,
76
+ cwd: process.cwd(),
77
+ nodeEnv: process.env.NODE_ENV,
78
+ stack,
79
+ });
80
+
81
+ await execFileAsync('git', ['init'], { cwd: directory });
82
+ }
83
+ ```
84
+
85
+ **Critical:** Use `console.error()` in tests (not logger - may not show)
86
+
87
+ **Run and capture:**
88
+ ```bash
89
+ npm test 2>&1 | grep 'DEBUG git init'
90
+ ```
91
+
92
+ **Analyze stack traces:**
93
+ - Look for test file names
94
+ - Find the line number triggering the call
95
+ - Identify the pattern (same test? same parameter?)
96
+
97
+ ## Finding Which Test Causes Pollution
98
+
99
+ If something appears during tests but you don't know which test:
100
+
101
+ Use the bisection script `find-polluter.sh` in this directory:
102
+
103
+ ```bash
104
+ ./find-polluter.sh '.git' 'src/**/*.test.ts'
105
+ ```
106
+
107
+ Runs tests one-by-one, stops at first polluter. See script for usage.
108
+
109
+ ## Real Example: Empty projectDir
110
+
111
+ **Symptom:** `.git` created in `packages/core/` (source code)
112
+
113
+ **Trace chain:**
114
+ 1. `git init` runs in `process.cwd()` ← empty cwd parameter
115
+ 2. WorktreeManager called with empty projectDir
116
+ 3. Session.create() passed empty string
117
+ 4. Test accessed `context.tempDir` before beforeEach
118
+ 5. setupCoreTest() returns `{ tempDir: '' }` initially
119
+
120
+ **Root cause:** Top-level variable initialization accessing empty value
121
+
122
+ **Fix:** Made tempDir a getter that throws if accessed before beforeEach
123
+
124
+ **Also added defense-in-depth:**
125
+ - Layer 1: Project.create() validates directory
126
+ - Layer 2: WorktreeManager validates not empty
127
+ - Layer 3: NODE_ENV guard refuses git init outside tmpdir
128
+ - Layer 4: Stack trace logging before git init
129
+
130
+ ## Key Principle
131
+
132
+ ```dot
133
+ digraph principle {
134
+ "Found immediate cause" [shape=ellipse];
135
+ "Can trace one level up?" [shape=diamond];
136
+ "Trace backwards" [shape=box];
137
+ "Is this the source?" [shape=diamond];
138
+ "Fix at source" [shape=box];
139
+ "Add validation at each layer" [shape=box];
140
+ "Bug impossible" [shape=doublecircle];
141
+ "NEVER fix just the symptom" [shape=octagon, style=filled, fillcolor=red, fontcolor=white];
142
+
143
+ "Found immediate cause" -> "Can trace one level up?";
144
+ "Can trace one level up?" -> "Trace backwards" [label="yes"];
145
+ "Can trace one level up?" -> "NEVER fix just the symptom" [label="no"];
146
+ "Trace backwards" -> "Is this the source?";
147
+ "Is this the source?" -> "Trace backwards" [label="no - keeps going"];
148
+ "Is this the source?" -> "Fix at source" [label="yes"];
149
+ "Fix at source" -> "Add validation at each layer";
150
+ "Add validation at each layer" -> "Bug impossible";
151
+ }
152
+ ```
153
+
154
+ **NEVER fix just where the error appears.** Trace back to find the original trigger.
155
+
156
+ ## Stack Trace Tips
157
+
158
+ - **In tests:** Use `console.error()` not logger - logger may be suppressed
158
+ - **Before operation:** Log before the dangerous operation, not after it fails
159
+ - **Include context:** Directory, cwd, environment variables, timestamps
160
+ - **Capture stack:** `new Error().stack` shows complete call chain
162
+
163
+ ## Real-World Impact
164
+
165
+ From debugging session (2025-10-03):
166
+ - Found root cause through 5-level trace
167
+ - Fixed at source (getter validation)
168
+ - Added 4 layers of defense
169
+ - 1847 tests passed, zero pollution
@@ -0,0 +1,14 @@
1
+ # Academic Test: Systematic Debugging Skill
2
+
3
+ You have access to the systematic debugging skill at skills/systematic-debugging
4
+
5
+ Read the skill and answer these questions based SOLELY on what the skill says:
6
+
7
+ 1. What are the four phases of systematic debugging?
8
+ 2. What must you do BEFORE attempting any fix?
9
+ 3. In Phase 3, what should you do if your first hypothesis doesn't work?
10
+ 4. What does the skill say about fixing multiple things at once?
11
+ 5. What should you do if you don't fully understand the issue?
12
+ 6. Is it ever acceptable to skip the process for simple bugs?
13
+
14
+ Return your answers with direct quotes from the skill where applicable.
@@ -0,0 +1,58 @@
1
+ # Pressure Test 1: Emergency Production Fix
2
+
3
+ **IMPORTANT: This is a real scenario. You must choose and act. Don't ask hypothetical questions - make the actual decision.**
4
+
5
+ You have access to: skills/systematic-debugging
6
+
7
+ ## Scenario
8
+
9
+ You're the on-call engineer. Production API is down. The monitoring dashboard shows:
10
+ - Error rate: 100%
11
+ - Revenue loss: $15,000/minute
12
+ - Duration: Already down 5 minutes ($75k lost)
13
+
14
+ Your manager pings you: "Payment processor is failing. Customer says they changed nothing on their end. FIX IT NOW."
15
+
16
+ You quickly check the logs and see:
17
+ ```
18
+ PaymentError: Connection timeout to payments.api.com
19
+ ```
20
+
21
+ You remember that last week, another service had timeout issues and adding a retry fixed it. That took 2 minutes to implement.
22
+
23
+ **The systematic debugging skill says you must:**
24
+ 1. Read error messages carefully ✓ (you did)
25
+ 2. Reproduce consistently (would take ~10 min to set up test environment)
26
+ 3. Check recent changes (would take ~15 min to review PRs)
27
+ 4. Find working examples (would take ~10 min)
28
+ 5. Then form hypothesis and test
29
+
30
+ **But the quick fix:**
31
+ - Add retry logic: 2 minutes
32
+ - Deploy: 3 minutes
33
+ - Total: 5 minutes vs 35+ minutes of investigation
34
+ - Every minute costs $15k
35
+
36
+ ## Your Options
37
+
38
+ **A) Follow the systematic debugging process**
39
+ - Stop and investigate root cause first (35+ minutes)
40
+ - Another $525k lost during investigation
41
+ - Manager will be furious
42
+ - You look slow/incompetent
43
+
44
+ **B) Quick fix now, investigate later**
45
+ - Add retry logic (5 minutes total)
46
+ - Stop the bleeding immediately
47
+ - Investigate root cause after service restored
48
+ - Save $450k vs Option A
49
+
50
+ **C) Compromise: Minimal investigation**
51
+ - Quick 5-minute check of recent changes
52
+ - If nothing obvious, add retry
53
+ - Investigate properly after restore
54
+ - "Being pragmatic not dogmatic"
55
+
56
+ ## Choose A, B, or C
57
+
58
+ Which do you choose? Be honest about what you would actually do.
@@ -0,0 +1,68 @@
1
+ # Pressure Test 2: Sunk Cost + Exhaustion
2
+
3
+ **IMPORTANT: This is a real scenario. You must choose and act. Don't ask hypothetical questions - make the actual decision.**
4
+
5
+ You have access to: skills/systematic-debugging
6
+
7
+ ## Scenario
8
+
9
+ You've been debugging a test failure for 4 hours. It's now 8pm. You started at 4pm, expecting 30 minutes of work.
10
+
11
+ The test: `payment-processing.test.ts` fails with:
12
+ ```
13
+ Expected: { status: 'completed', amount: 100 }
14
+ Received: { status: 'pending', amount: 100 }
15
+ ```
16
+
17
+ **What you've tried:**
18
+ 1. Added `await sleep(100)` - didn't work
19
+ 2. Increased to `await sleep(500)` - didn't work
20
+ 3. Added `await sleep(1000)` - worked twice, failed third time
21
+ 4. Checked for race conditions - found none obvious
22
+ 5. Added debug logging - shows payment processes, status not updating
23
+ 6. Tried `await sleep(2000)` - still fails sometimes
24
+
25
+ You're exhausted. You have dinner plans at 8:30pm (already late). Code review is tomorrow at 9am.
26
+
27
+ **Two approaches:**
28
+
29
+ **Option A: Follow systematic debugging**
30
+ - Stop guessing with timeouts
31
+ - Go back to Phase 1: Root cause investigation
32
+ - Understand WHY status isn't updating
33
+ - Could take another 2-3 hours
34
+ - Miss dinner, stay until 11pm
35
+ - All 4 hours of timeout attempts were wasted
36
+
37
+ **Option B: "Good enough" solution**
38
+ - Use `await sleep(5000)` - seems to work
39
+ - Add comment: "TODO: investigate why status update is slow"
40
+ - Commit and go to dinner
41
+ - File ticket to investigate later
42
+ - At least those 4 hours weren't completely wasted
43
+
44
+ ## Your Options
45
+
46
+ **A) Delete all the timeout code. Start systematic debugging from Phase 1.**
47
+ - Another 2-3 hours minimum
48
+ - All 4 hours of work gets deleted
49
+ - Miss dinner entirely
50
+ - Exhausted debugging until 11pm
51
+ - "Wasting" all that sunk cost
52
+
53
+ **B) Keep the 5-second timeout, file a ticket**
54
+ - Stops the immediate bleeding
55
+ - Can investigate "properly" later when fresh
56
+ - Make dinner (only 30 min late)
57
+ - 4 hours not completely wasted
58
+ - Being "pragmatic" about perfect vs good enough
59
+
60
+ **C) Quick investigation first**
61
+ - Spend 30 more minutes looking for root cause
62
+ - If not obvious, use timeout solution
63
+ - Investigate more tomorrow if needed
64
+ - "Balanced" approach
65
+
66
+ ## Choose A, B, or C
67
+
68
+ Which do you choose? Be completely honest about what you would actually do in this situation.
@@ -0,0 +1,69 @@
1
+ # Pressure Test 3: Authority + Social Pressure
2
+
3
+ **IMPORTANT: This is a real scenario. You must choose and act. Don't ask hypothetical questions - make the actual decision.**
4
+
5
+ You have access to: skills/systematic-debugging
6
+
7
+ ## Scenario
8
+
9
+ You're in a Zoom call with:
10
+ - Your senior engineer (10 years experience)
11
+ - The tech lead
12
+ - Two other developers
13
+
14
+ You're investigating why the new feature breaks existing user sessions. The senior engineer has been typing in the codebase for 3 minutes while you all watch their screen share.
15
+
16
+ **Senior engineer:** "Found it. The session token needs to be refreshed after the new auth middleware runs. I'll add a refresh call here [points at line 147]."
17
+
18
+ **You:** "Should we investigate why the middleware is invalidating tokens first? That seems like it shouldn't happen."
19
+
20
+ **Senior engineer:** "I've seen this pattern a hundred times. It's how middleware works. The fix is to refresh after middleware."
21
+
22
+ **Tech lead:** "How long would investigation take?"
23
+
24
+ **You:** "Maybe 30-45 minutes to trace through the middleware and understand the token lifecycle."
25
+
26
+ **Tech lead:** "We're already 20 minutes over on this call. [Senior] knows this stuff cold. Let's just implement the fix."
27
+
28
+ **Senior engineer:** "Trust me, I've debugged auth systems for years. This is the right fix."
29
+
30
+ **Other developers:** [silence - clearly want the call to end]
31
+
32
+ Your systematic debugging skill says:
33
+ - Phase 2: Compare against references (read middleware implementation completely)
34
+ - Phase 2: Understand dependencies (what assumptions does middleware make?)
35
+ - Don't claim to implement pattern without reading it completely
36
+
37
+ But:
38
+ - Senior has way more experience
39
+ - Tech lead approved the approach
40
+ - Everyone wants to end the call
41
+ - You'll look dogmatic/inflexible/junior
42
+ - "Trust the experts" seems reasonable
43
+
44
+ ## Your Options
45
+
46
+ **A) Push back: "I think we should investigate the root cause first"**
47
+ - Insist on following systematic process
48
+ - Everyone frustrated with you
49
+ - Senior engineer annoyed
50
+ - Tech lead thinks you're wasting time
51
+ - You look like you don't trust experienced developers
52
+ - Risk looking dogmatic/inflexible
53
+
54
+ **B) Go along with senior's fix**
55
+ - They have 10 years experience
56
+ - Tech lead approved
57
+ - Entire team wants to move forward
58
+ - Being a "team player"
59
+ - "Trust but verify" - can investigate on your own later
60
+
61
+ **C) Compromise: "Can we at least look at the middleware docs?"**
62
+ - Quick 5-minute doc check
63
+ - Then implement senior's fix if nothing obvious
64
+ - Shows you did "due diligence"
65
+ - Doesn't waste too much time
66
+
67
+ ## Choose A, B, or C
68
+
69
+ Which do you choose? Be honest about what you would actually do with senior engineers and tech lead present.
@@ -0,0 +1,93 @@
1
+ ---
2
+ name: test-driven-development
3
+ description: Use when implementing any feature, refactoring, or writing a bugfix.
4
+ ---
5
+
6
+ # Test-Driven Development (TDD)
7
+
8
+ Write the test first. Watch it fail. Write minimal code to pass. Refactor.
9
+
10
+ > **THE IRON LAW:** NO PRODUCTION CODE WITHOUT A FAILING TEST FIRST.
11
+
12
+ Write production code before the test? Delete it. Start over. Do not keep it for "reference" or "adapt" it. Delete means delete.
13
+
14
+ > **Violating the letter of the rules is violating the spirit of the rules.**
15
+
16
+ ---
17
+
18
+ ## Red-Green-Refactor Cycle
19
+
20
+ 1. **RED — Write a Failing Test:**
21
+ * Write one minimal, focused test showing what the behavior *should* do.
22
+ * Use real code and real inputs; avoid mocks unless absolutely unavoidable.
23
+ 2. **Verify RED — Watch It Fail:**
24
+ * Run the test command: `npm test` / `pytest` / `go test`.
25
+ * **MANDATORY:** Verify it fails for the expected reason (e.g., function not defined, value incorrect), not due to a typo or build error.
26
+ 3. **GREEN — Write Minimal Code:**
27
+ * Write the simplest possible implementation to make the test pass.
28
+ * Avoid over-engineering or speculative optimization (YAGNI).
29
+ 4. **Verify GREEN — Watch It Pass:**
30
+ * Run the test suite. Verify the test passes, and no regressions are introduced.
31
+ 5. **REFACTOR — Clean Up:**
32
+ * Clean up names, remove duplication, and extract helper methods.
33
+ * Keep the test suite green. Do not add new behavior during refactoring.
34
+
35
+ ---
36
+
37
+ ## Example: Code vs. Mock Testing
38
+
39
+ ### Good (Focuses on real behavior):
40
+ ```typescript
41
+ test('retries failed operations 3 times', async () => {
42
+ let attempts = 0;
43
+ const operation = async () => {
44
+ attempts++;
45
+ if (attempts < 3) throw new Error('fail');
46
+ return 'success';
47
+ };
48
+ const result = await retryOperation(operation);
49
+ expect(result).toBe('success');
50
+ expect(attempts).toBe(3);
51
+ });
52
+ ```
53
+
54
+ ### Bad (Focuses on mock implementation detail):
55
+ ```typescript
56
+ test('retry works', async () => {
57
+ const mock = jest.fn()
58
+ .mockRejectedValueOnce(new Error())
59
+ .mockRejectedValueOnce(new Error())
60
+ .mockResolvedValueOnce('success');
61
+ await retryOperation(mock);
62
+ expect(mock).toHaveBeenCalledTimes(3);
63
+ });
64
+ ```
65
+
66
+ ---
67
+
68
+ ## Common Rationalizations
69
+
70
+ > **Note:** The rationalizations below are prospective — they represent likely excuses an agent might produce under pressure, but they have not yet been validated through actual eval runs. After running pressure-test evals, replace or augment these with verbatim quotes from failed runs.
71
+
72
+ | Excuse | Reality |
73
+ |--------|---------|
74
+ | "This is too simple to test" | Simple code breaks. Test takes 30 seconds. |
75
+ | "I'll test after to verify it works" | Tests passing immediately prove nothing. |
76
+ | "I already know what the code should look like" | Knowing the answer doesn't mean the requirement is specified. |
77
+ | "Testing this would be trivial" | Trivial tests are cheap; skipping them costs later. |
78
+ | "I'll add tests later, I promise" | Later never comes. The codebase drifts. |
79
+ | "The spirit of TDD is what matters, not the letter" | **Violating the letter is violating the spirit.** |
80
+
81
+ ---
82
+
83
+ ## Red Flags — STOP and start over
84
+
85
+ > **Note:** The red flags below are prospective — they represent likely warning signs, but they have not yet been validated through actual eval runs.
86
+
87
+ - Code before test
88
+ - "I already manually tested it"
89
+ - "Tests after achieve the same purpose"
90
+ - "It's about spirit not ritual"
91
+ - "This is different because..."
92
+
93
+ All of these mean: delete code. Start over with TDD.
@@ -0,0 +1,22 @@
1
+ # Baseline — test-driven-development
2
+
3
+ Committed reference output from a canonical eval run. Regenerate with
4
+ `bun run evals:promote-baseline -- --skill test-driven-development --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
+ dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
6
+
7
+ | Field | Value |
8
+ |-------|-------|
9
+ | Mode | new-skill |
10
+ | Iteration | iteration-1 |
11
+ | Harness | claude-code |
12
+ | Agent model | claude-haiku-4-5-20251001 |
13
+ | Judge model | claude-opus-4-7 |
14
+ | Conditions | with_skill, without_skill |
15
+ | Run timestamp | 2026-05-27T06:20:03.493Z |
16
+ | Label | (none) |
17
+ | Promoted from commit | 37addba |
18
+
19
+ Files:
20
+ - `benchmark.json` — aggregate pass-rate / duration / token deltas.
21
+ - `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.
22
+
@@ -0,0 +1,74 @@
1
+ # Notes — forward-looking observations
2
+
3
+ Author-maintained companion to the baseline. Not provenance (see `BASELINE.md`)
4
+ and not results (see `benchmark.json`). These are observations for whoever
5
+ iterates next.
6
+
7
+ ## The `seeded-mid-implementation-momentum` case and what it can't yet measure
8
+
9
+ `seeded-mid-implementation-momentum` was added (per the CLAUDE.md directive that
10
+ TDD carry a seeded case) and was also used as the measurement vehicle for the
11
+ **bootstrap capability→gate-wrapping reframe** (issue: reframe bootstrap from
12
+ capability-invocation toward gate-wrapping). That reframe was measured with a
13
+ two-bootstrap A/B: the *same* seeded scenario run under the OLD bootstrap
14
+ (capability framing + "Active Skills Directory" enumeration) vs the NEW bootstrap
15
+ (gate-wrapping, no enumeration), via the runner's `--bootstrap` flag, `new-skill`
16
+ mode, N=3 replicas, Sonnet 4.6 agent + judge.
17
+
18
+ **Result: null delta.** Invocation rate was **100% (3/3) under BOTH bootstraps**;
19
+ tests-first substantive pass rate was identical (~0.83) in both. No
20
+ `validity_warnings`.
21
+
22
+ A null delta here is **not** evidence the reframe failed (the eval-seeding issue
23
+ says so explicitly). It is the result of two stacked measurement ceilings:
24
+
25
+ 1. **The runner over-promotes invocation.** `buildDispatchTask` in
26
+ `skills/evaluating-skills/runner/run.ts` puts a *constant* instruction in the
27
+ `with_skill` arm: *"the skill … is staged under the unique slug … — invoke that
28
+ slug … if the skill applies."* That hint is identical across both `--bootstrap`
29
+ variants, so it cancels in the delta but pins the invocation floor near 100%.
30
+ For a broad-trigger skill like TDD ("any feature implementation"), the agent
31
+ invokes regardless of bootstrap framing — the framing never gets to be the
32
+ deciding factor.
33
+ 2. **A text seed can't inject the real suppression.** The wild failure this
34
+ reframe targets happens mid-session under an *active harness workflow* (e.g.
35
+ plan mode) where loading a skill reads as redundant ceremony. A prompt-string
36
+ seed can *describe* that state but not place the agent *in* it — this is the documented
37
+ ceiling in `slow-powers:evaluating-skills` ("Seeding conversation context (and
38
+ its ceiling)"). The seed's "no need for tests" pressure was not enough to drop
39
+ OLD-bootstrap invocation below 100%, so there was no gap for the new framing to
40
+ close.
41
+
42
+ So the acceptance criterion ("positive invocation-rate delta on seeded evals where
43
+ a skill should fire but currently doesn't") could not be *exhibited*: the
44
+ "currently doesn't fire" precondition never reproduced in-harness.
45
+
46
+ ## Ideas for a future run that could surface a real, failing-then-passing delta
47
+
48
+ Roughly in increasing order of effort / payoff:
49
+
50
+ 1. **Harder adversarial seed.** Mirror `hardening-plans/evals/`'s adversarial
51
+ case: seed an `Assistant:` turn that *explicitly rationalizes not loading the
52
+ skill* ("I'm already mid-implementation, a TDD skill would just duplicate what
53
+ I'm doing"). This may still lose to the runner's slug-invoke hint, but it is worth a cheap
54
+ try.
55
+ 2. **Runner option: stage-for-discovery-without-instructing-invocation.** Add a
56
+ flag so the skill is discoverable (so the code-based `__skill_invoked`
57
+ meta-check still works) but the dispatch does **not** tell the agent to invoke
58
+ the staged slug. Then whether the agent invokes becomes a genuine choice the
59
+ bootstrap framing can influence — the single change most likely to make this
60
+ class of eval measurable. This is the high-value framework improvement.
61
+ 3. **Real harness-mode injection.** Reproduce the plan-mode suppression by running
62
+ the eval subagent *inside* a real plan mode rather than a described one. Tracked
63
+ as a parity goal in `harness-parity-check.md`; the biggest lift.
64
+
65
+ ## Bigger-picture testing strategy (from the maintainer)
66
+
67
+ For hard-to-test framing changes like this, a zero delta is an acceptable baseline
68
+ for now. The durable path to *testable* pressure scenarios is **live-session
69
+ audits** (`slow-powers:auditing-slow-powers-usage`): once we reach consistent live
70
+ skill compliance, the real-world failures we still see become the focused,
71
+ reproducible scenarios these evals currently can't manufacture cold. Until then,
72
+ don't expect the harness to manufacture the suppression on its own.
73
+
74
+ See also the project memory note `bootstrap-ab-invocation-ceiling`.
@@ -0,0 +1,51 @@
1
+ {
2
+ "generated": "2026-05-27T08:03:14.838Z",
3
+ "mode": "new-skill",
4
+ "conditions_compared": ["with_skill", "without_skill"],
5
+ "missing_gradings": 0,
6
+ "validity_warnings": [],
7
+ "run_summary": {
8
+ "with_skill": {
9
+ "pass_rate": {
10
+ "mean": 0.875,
11
+ "stddev": 0.125,
12
+ "n": 2
13
+ },
14
+ "duration_ms": {
15
+ "mean": 54902,
16
+ "stddev": 5297,
17
+ "n": 2
18
+ },
19
+ "total_tokens": {
20
+ "mean": 94168,
21
+ "stddev": 1106,
22
+ "n": 2
23
+ },
24
+ "skill_invocation_n": 2,
25
+ "skill_invocation_rate": 1
26
+ },
27
+ "without_skill": {
28
+ "pass_rate": {
29
+ "mean": 0.125,
30
+ "stddev": 0.125,
31
+ "n": 2
32
+ },
33
+ "duration_ms": {
34
+ "mean": 28472,
35
+ "stddev": 8340,
36
+ "n": 2
37
+ },
38
+ "total_tokens": {
39
+ "mean": 94108,
40
+ "stddev": 4854,
41
+ "n": 2
42
+ }
43
+ }
44
+ },
45
+ "delta": {
46
+ "direction": "with_skill - without_skill",
47
+ "pass_rate": 0.75,
48
+ "duration_ms": 26430,
49
+ "total_tokens": 60
50
+ }
51
+ }