@slowdini/slow-powers-opencode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +174 -0
  3. package/bootstrap.md +16 -0
  4. package/opencode/plugins/slow-powers.js +86 -0
  5. package/package.json +66 -0
  6. package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
  7. package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
  8. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
  9. package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
  10. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
  11. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
  12. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
  13. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
  14. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
  15. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
  16. package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
  17. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
  18. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
  19. package/skills/evaluating-skills/SKILL.md +448 -0
  20. package/skills/evaluating-skills/evals/evals.json +52 -0
  21. package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
  22. package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
  23. package/skills/evaluating-skills/harness-details/claude.md +135 -0
  24. package/skills/evaluating-skills/pressure-scenarios.md +163 -0
  25. package/skills/evaluating-skills/runner/README.md +140 -0
  26. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
  27. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
  28. package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
  29. package/skills/evaluating-skills/runner/aggregate.ts +228 -0
  30. package/skills/evaluating-skills/runner/context.test.ts +181 -0
  31. package/skills/evaluating-skills/runner/context.ts +90 -0
  32. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
  33. package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
  34. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
  35. package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
  36. package/skills/evaluating-skills/runner/grade.test.ts +347 -0
  37. package/skills/evaluating-skills/runner/grade.ts +603 -0
  38. package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
  39. package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
  40. package/skills/evaluating-skills/runner/guard/install.ts +147 -0
  41. package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
  42. package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
  43. package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
  44. package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
  45. package/skills/evaluating-skills/runner/run.test.ts +716 -0
  46. package/skills/evaluating-skills/runner/run.ts +814 -0
  47. package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
  48. package/skills/evaluating-skills/runner/types.ts +104 -0
  49. package/skills/evaluating-skills/runner/validate-all.ts +54 -0
  50. package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
  51. package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
  52. package/skills/evaluating-skills/runner/validate.test.ts +56 -0
  53. package/skills/evaluating-skills/runner/validate.ts +21 -0
  54. package/skills/evaluating-skills/schema/evals.schema.json +105 -0
  55. package/skills/evaluating-skills/schema/grading.schema.json +84 -0
  56. package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
  57. package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
  58. package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
  59. package/skills/evaluating-skills/templates/evals.json.example +17 -0
  60. package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
  61. package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
  62. package/skills/finishing-a-development-branch/SKILL.md +96 -0
  63. package/skills/finishing-a-development-branch/evals/evals.json +41 -0
  64. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
  65. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
  66. package/skills/hardening-plans/SKILL.md +72 -0
  67. package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
  68. package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
  69. package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
  70. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
  71. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
  72. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
  73. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
  74. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
  75. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
  76. package/skills/hardening-plans/evals/evals.json +114 -0
  77. package/skills/systematic-debugging/CREATION-LOG.md +119 -0
  78. package/skills/systematic-debugging/SKILL.md +84 -0
  79. package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
  80. package/skills/systematic-debugging/condition-based-waiting.md +115 -0
  81. package/skills/systematic-debugging/defense-in-depth.md +122 -0
  82. package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
  83. package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
  84. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
  85. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
  86. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
  87. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
  88. package/skills/systematic-debugging/evals/evals.json +45 -0
  89. package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
  90. package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
  91. package/skills/systematic-debugging/find-polluter.sh +63 -0
  92. package/skills/systematic-debugging/root-cause-tracing.md +169 -0
  93. package/skills/systematic-debugging/test-academic.md +14 -0
  94. package/skills/systematic-debugging/test-pressure-1.md +58 -0
  95. package/skills/systematic-debugging/test-pressure-2.md +68 -0
  96. package/skills/systematic-debugging/test-pressure-3.md +69 -0
  97. package/skills/test-driven-development/SKILL.md +93 -0
  98. package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
  99. package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
  100. package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
  101. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
  102. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
  103. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
  104. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
  105. package/skills/test-driven-development/evals/evals.json +77 -0
  106. package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
  107. package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
  108. package/skills/test-driven-development/testing-anti-patterns.md +299 -0
  109. package/skills/using-git-worktrees/SKILL.md +70 -0
  110. package/skills/using-git-worktrees/evals/evals.json +40 -0
  111. package/skills/verification-before-completion/SKILL.md +65 -0
  112. package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
  113. package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
  114. package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
  115. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  116. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  117. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
  118. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  119. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  120. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  121. package/skills/verification-before-completion/evals/evals.json +77 -0
  122. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
  123. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
  124. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
  125. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
  126. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
  127. package/skills/writing-skills/SKILL.md +306 -0
  128. package/skills/writing-skills/evals/evals.json +40 -0
  129. package/skills/writing-skills/graphviz-conventions.dot +172 -0
  130. package/skills/writing-skills/persuasion-principles.md +187 -0
  131. package/skills/writing-skills/scripts/render-graphs.js +181 -0
@@ -0,0 +1,164 @@
1
+ // Complete implementation of condition-based waiting utilities
2
+ // From: Lace test infrastructure improvements (2025-10-03)
3
+ // Context: Fixed 15 flaky tests by replacing arbitrary timeouts
4
+
5
+ import type { ThreadManager } from "~/threads/thread-manager";
6
+ import type { LaceEvent, LaceEventType } from "~/threads/types";
7
+
8
/**
 * Wait for a specific event type to appear in a thread.
 *
 * Polls `threadManager.getEvents(threadId)` every 10ms until an event whose
 * `type` equals `eventType` is present, or until more than `timeoutMs`
 * milliseconds have elapsed since the call started.
 *
 * @param threadManager - The thread manager to query
 * @param threadId - Thread to check for events
 * @param eventType - Type of event to wait for
 * @param timeoutMs - Maximum time to wait (default 5000ms)
 * @returns Promise resolving to the first matching event
 * @throws Rejects with a timeout `Error` when no matching event shows up in
 *   time, or with whatever `getEvents` throws if a poll fails
 *
 * Example:
 *   await waitForEvent(threadManager, agentThreadId, 'TOOL_RESULT');
 */
export function waitForEvent(
  threadManager: ThreadManager,
  threadId: string,
  eventType: LaceEventType,
  timeoutMs = 5000,
): Promise<LaceEvent> {
  return new Promise((resolve, reject) => {
    const startTime = Date.now();

    const check = (): void => {
      let event: LaceEvent | undefined;
      try {
        // Re-read the events on every poll so the check never sees stale data.
        event = threadManager
          .getEvents(threadId)
          .find((e) => e.type === eventType);
      } catch (error) {
        // Every poll after the first runs inside a timer callback. A throw
        // there would escape as an uncaught exception and leave this promise
        // pending forever, so hand the failure to the caller instead.
        reject(error);
        return;
      }

      if (event) {
        resolve(event);
        return;
      }

      if (Date.now() - startTime > timeoutMs) {
        reject(
          new Error(
            `Timeout waiting for ${eventType} event after ${timeoutMs}ms`,
          ),
        );
        return;
      }

      setTimeout(check, 10); // Poll every 10ms
    };

    check();
  });
}
49
+
50
/**
 * Wait until a thread contains at least `count` events of a given type.
 *
 * Polls `threadManager.getEvents(threadId)` every 10ms, filtering by
 * `eventType`, until the number of matches reaches `count` or more than
 * `timeoutMs` milliseconds have elapsed since the call started.
 *
 * @param threadManager - The thread manager to query
 * @param threadId - Thread to check for events
 * @param eventType - Type of event to wait for
 * @param count - Number of events to wait for
 * @param timeoutMs - Maximum time to wait (default 5000ms)
 * @returns Promise resolving to all matching events once count is reached
 * @throws Rejects with a timeout `Error` (including how many events were
 *   seen) when the count is not reached in time, or with whatever
 *   `getEvents` throws if a poll fails
 *
 * Example:
 *   // Wait for 2 AGENT_MESSAGE events (initial response + continuation)
 *   await waitForEventCount(threadManager, agentThreadId, 'AGENT_MESSAGE', 2);
 */
export function waitForEventCount(
  threadManager: ThreadManager,
  threadId: string,
  eventType: LaceEventType,
  count: number,
  timeoutMs = 5000,
): Promise<LaceEvent[]> {
  return new Promise((resolve, reject) => {
    const startTime = Date.now();

    const check = (): void => {
      let matchingEvents: LaceEvent[];
      try {
        // Re-read the events on every poll so the check never sees stale data.
        matchingEvents = threadManager
          .getEvents(threadId)
          .filter((e) => e.type === eventType);
      } catch (error) {
        // Every poll after the first runs inside a timer callback. A throw
        // there would escape as an uncaught exception and leave this promise
        // pending forever, so hand the failure to the caller instead.
        reject(error);
        return;
      }

      if (matchingEvents.length >= count) {
        resolve(matchingEvents);
        return;
      }

      if (Date.now() - startTime > timeoutMs) {
        reject(
          new Error(
            `Timeout waiting for ${count} ${eventType} events after ${timeoutMs}ms (got ${matchingEvents.length})`,
          ),
        );
        return;
      }

      setTimeout(check, 10); // Poll every 10ms
    };

    check();
  });
}
94
+
95
/**
 * Wait for an event matching a custom predicate.
 * Useful when you need to check event data, not just type.
 *
 * Polls `threadManager.getEvents(threadId)` every 10ms and resolves with the
 * first event for which `predicate` returns true, or rejects once more than
 * `timeoutMs` milliseconds have elapsed since the call started.
 *
 * @param threadManager - The thread manager to query
 * @param threadId - Thread to check for events
 * @param predicate - Function that returns true when event matches
 * @param description - Human-readable description for error messages
 * @param timeoutMs - Maximum time to wait (default 5000ms)
 * @returns Promise resolving to the first matching event
 * @throws Rejects with a timeout `Error` mentioning `description` when no
 *   event matches in time, or with whatever `getEvents` or `predicate`
 *   throws if a poll fails
 *
 * Example:
 *   // Wait for TOOL_RESULT with specific ID
 *   await waitForEventMatch(
 *     threadManager,
 *     agentThreadId,
 *     (e) => e.type === 'TOOL_RESULT' && e.data.id === 'call_123',
 *     'TOOL_RESULT with id=call_123'
 *   );
 */
export function waitForEventMatch(
  threadManager: ThreadManager,
  threadId: string,
  predicate: (event: LaceEvent) => boolean,
  description: string,
  timeoutMs = 5000,
): Promise<LaceEvent> {
  return new Promise((resolve, reject) => {
    const startTime = Date.now();

    const check = (): void => {
      let event: LaceEvent | undefined;
      try {
        // Re-read the events on every poll so the check never sees stale data.
        event = threadManager.getEvents(threadId).find(predicate);
      } catch (error) {
        // Every poll after the first runs inside a timer callback. A throw
        // from getEvents or the caller's predicate there would escape as an
        // uncaught exception and leave this promise pending forever, so hand
        // the failure to the caller instead.
        reject(error);
        return;
      }

      if (event) {
        resolve(event);
        return;
      }

      if (Date.now() - startTime > timeoutMs) {
        reject(
          new Error(`Timeout waiting for ${description} after ${timeoutMs}ms`),
        );
        return;
      }

      setTimeout(check, 10); // Poll every 10ms
    };

    check();
  });
}
143
+
144
+ // Usage example from actual debugging session:
145
+ //
146
+ // BEFORE (flaky):
147
+ // ---------------
148
+ // const messagePromise = agent.sendMessage('Execute tools');
149
+ // await new Promise(r => setTimeout(r, 300)); // Hope tools start in 300ms
150
+ // agent.abort();
151
+ // await messagePromise;
152
+ // await new Promise(r => setTimeout(r, 50)); // Hope results arrive in 50ms
153
+ // expect(toolResults.length).toBe(2); // Fails randomly
154
+ //
155
+ // AFTER (reliable):
156
+ // ----------------
157
+ // const messagePromise = agent.sendMessage('Execute tools');
158
+ // await waitForEventCount(threadManager, threadId, 'TOOL_CALL', 2); // Wait for tools to start
159
+ // agent.abort();
160
+ // await messagePromise;
161
+ // await waitForEventCount(threadManager, threadId, 'TOOL_RESULT', 2); // Wait for results
162
+ // expect(toolResults.length).toBe(2); // Always succeeds
163
+ //
164
+ // Result: 60% pass rate → 100%, 40% faster execution
@@ -0,0 +1,115 @@
1
+ # Condition-Based Waiting
2
+
3
+ ## Overview
4
+
5
+ Flaky tests often guess at timing with arbitrary delays. This creates race conditions where tests pass on fast machines but fail under load or in CI.
6
+
7
+ **Core principle:** Wait for the actual condition you care about, not a guess about how long it takes.
8
+
9
+ ## When to Use
10
+
11
+ ```dot
12
+ digraph when_to_use {
13
+ "Test uses setTimeout/sleep?" [shape=diamond];
14
+ "Testing timing behavior?" [shape=diamond];
15
+ "Document WHY timeout needed" [shape=box];
16
+ "Use condition-based waiting" [shape=box];
17
+
18
+ "Test uses setTimeout/sleep?" -> "Testing timing behavior?" [label="yes"];
19
+ "Testing timing behavior?" -> "Document WHY timeout needed" [label="yes"];
20
+ "Testing timing behavior?" -> "Use condition-based waiting" [label="no"];
21
+ }
22
+ ```
23
+
24
+ **Use when:**
25
+ - Tests have arbitrary delays (`setTimeout`, `sleep`, `time.sleep()`)
26
+ - Tests are flaky (pass sometimes, fail under load)
27
+ - Tests time out when run in parallel
28
+ - Waiting for async operations to complete
29
+
30
+ **Don't use when:**
31
+ - Testing actual timing behavior (debounce, throttle intervals)
32
+ - Exception: if an arbitrary timeout is genuinely required, always document WHY
33
+
34
+ ## Core Pattern
35
+
36
+ ```typescript
37
+ // ❌ BEFORE: Guessing at timing
38
+ await new Promise(r => setTimeout(r, 50));
39
+ const result = getResult();
40
+ expect(result).toBeDefined();
41
+
42
+ // ✅ AFTER: Waiting for condition
43
+ await waitFor(() => getResult() !== undefined);
44
+ const result = getResult();
45
+ expect(result).toBeDefined();
46
+ ```
47
+
48
+ ## Quick Patterns
49
+
50
+ | Scenario | Pattern |
51
+ |----------|---------|
52
+ | Wait for event | `waitFor(() => events.find(e => e.type === 'DONE'))` |
53
+ | Wait for state | `waitFor(() => machine.state === 'ready')` |
54
+ | Wait for count | `waitFor(() => items.length >= 5)` |
55
+ | Wait for file | `waitFor(() => fs.existsSync(path))` |
56
+ | Complex condition | `waitFor(() => obj.ready && obj.value > 10)` |
57
+
58
+ ## Implementation
59
+
60
+ Generic polling function:
61
+ ```typescript
62
/**
 * Poll `condition` until it returns a truthy value, then resolve with it.
 *
 * The condition is re-evaluated on every iteration (every 10ms), so it always
 * sees fresh data. If it is still falsy once more than `timeoutMs`
 * milliseconds have passed, the returned promise rejects.
 *
 * @param condition - Called on each poll; any truthy return value ends the wait
 * @param description - Human-readable label used in the timeout error.
 *   Defaults to 'condition' so the one-argument form `waitFor(() => ...)`
 *   still yields a readable message.
 * @param timeoutMs - Maximum time to wait (default 5000ms)
 * @returns The first truthy value returned by `condition`
 * @throws Error when the timeout elapses first; anything `condition` throws
 *   propagates unchanged
 */
async function waitFor<T>(
  condition: () => T | undefined | null | false,
  description = 'condition',
  timeoutMs = 5000
): Promise<T> {
  const startTime = Date.now();

  while (true) {
    const result = condition();
    if (result) return result;

    if (Date.now() - startTime > timeoutMs) {
      throw new Error(`Timeout waiting for ${description} after ${timeoutMs}ms`);
    }

    await new Promise(r => setTimeout(r, 10)); // Poll every 10ms
  }
}
80
+ ```
81
+
82
+ See `condition-based-waiting-example.ts` in this directory for a complete implementation with domain-specific helpers (`waitForEvent`, `waitForEventCount`, `waitForEventMatch`) from an actual debugging session.
83
+
84
+ ## Common Mistakes
85
+
86
+ **❌ Polling too fast:** `setTimeout(check, 1)` - wastes CPU
87
+ **✅ Fix:** Poll every 10ms
88
+
89
+ **❌ No timeout:** Loop forever if condition never met
90
+ **✅ Fix:** Always include timeout with clear error
91
+
92
+ **❌ Stale data:** Cache state before loop
93
+ **✅ Fix:** Call getter inside loop for fresh data
94
+
95
+ ## When Arbitrary Timeout IS Correct
96
+
97
+ ```typescript
98
+ // Tool ticks every 100ms - need 2 ticks to verify partial output
99
+ await waitForEvent(manager, threadId, 'TOOL_STARTED'); // First: wait for condition
100
+ await new Promise(r => setTimeout(r, 200)); // Then: wait for timed behavior
101
+ // 200ms = 2 ticks at 100ms intervals - documented and justified
102
+ ```
103
+
104
+ **Requirements:**
105
+ 1. First wait for triggering condition
106
+ 2. Based on known timing (not guessing)
107
+ 3. Comment explaining WHY
108
+
109
+ ## Real-World Impact
110
+
111
+ From debugging session (2025-10-03):
112
+ - Fixed 15 flaky tests across 3 files
113
+ - Pass rate: 60% → 100%
114
+ - Execution time: 40% faster
115
+ - No more race conditions
@@ -0,0 +1,122 @@
1
+ # Defense-in-Depth Validation
2
+
3
+ ## Overview
4
+
5
+ When you fix a bug caused by invalid data, adding validation at one place feels sufficient. But that single check can be bypassed by different code paths, refactoring, or mocks.
6
+
7
+ **Core principle:** Validate at EVERY layer data passes through. Make the bug structurally impossible.
8
+
9
+ ## Why Multiple Layers
10
+
11
+ Single validation: "We fixed the bug"
12
+ Multiple layers: "We made the bug impossible"
13
+
14
+ Different layers catch different cases:
15
+ - Entry validation catches most bugs
16
+ - Business logic catches edge cases
17
+ - Environment guards prevent context-specific dangers
18
+ - Debug logging helps when other layers fail
19
+
20
+ ## The Four Layers
21
+
22
+ ### Layer 1: Entry Point Validation
23
+ **Purpose:** Reject obviously invalid input at API boundary
24
+
25
+ ```typescript
26
/**
 * Entry-point validation: reject an unusable working directory before any
 * project work starts.
 *
 * @param name - Project name (not inspected by the checks shown here)
 * @param workingDirectory - Path that must be non-blank, exist, and be a directory
 * @throws Error describing which of the three checks failed
 */
function createProject(name: string, workingDirectory: string) {
  // Blank covers both a falsy value and a whitespace-only string.
  const isBlank = !workingDirectory || workingDirectory.trim() === '';
  if (isBlank) {
    throw new Error('workingDirectory cannot be empty');
  }

  const pathExists = existsSync(workingDirectory);
  if (!pathExists) {
    throw new Error(`workingDirectory does not exist: ${workingDirectory}`);
  }

  const stats = statSync(workingDirectory);
  if (!stats.isDirectory()) {
    throw new Error(`workingDirectory is not a directory: ${workingDirectory}`);
  }
  // ... proceed
}
38
+ ```
39
+
40
+ ### Layer 2: Business Logic Validation
41
+ **Purpose:** Ensure data makes sense for this operation
42
+
43
+ ```typescript
44
/**
 * Business-logic validation: workspace initialization needs a project
 * directory, so a missing one is refused here as well.
 *
 * @param projectDir - Directory the workspace belongs to; must be truthy
 * @param sessionId - Session identifier (not inspected by the check shown here)
 * @throws Error when `projectDir` is falsy
 */
function initializeWorkspace(projectDir: string, sessionId: string) {
  const hasProjectDir = Boolean(projectDir);
  if (hasProjectDir === false) {
    throw new Error('projectDir required for workspace initialization');
  }
  // ... proceed
}
50
+ ```
51
+
52
+ ### Layer 3: Environment Guards
53
+ **Purpose:** Prevent dangerous operations in specific contexts
54
+
55
+ ```typescript
56
/**
 * Environment guard: while running under NODE_ENV === 'test', refuse to
 * `git init` anywhere other than the OS temp directory or a path beneath it.
 *
 * @param directory - Directory in which the repository would be initialized
 * @throws Error when tests try to initialize a repository outside the temp dir
 */
async function gitInit(directory: string) {
  // In tests, refuse git init outside temp directories
  if (process.env.NODE_ENV === 'test') {
    const normalized = normalize(resolve(directory));
    const tmpDir = normalize(resolve(tmpdir()));
    // Compare against the temp dir WITH a trailing separator. A bare prefix
    // check would also accept siblings such as "/tmp-evil" for "/tmp".
    // normalize() keeps the trailing separator and converts it to the
    // platform's own form.
    const tmpDirPrefix = normalize(`${tmpDir}/`);
    const insideTmpDir =
      normalized === tmpDir || normalized.startsWith(tmpDirPrefix);

    if (!insideTmpDir) {
      throw new Error(
        `Refusing git init outside temp dir during tests: ${directory}`
      );
    }
  }
  // ... proceed
}
70
+ ```
71
+
72
+ ### Layer 4: Debug Instrumentation
73
+ **Purpose:** Capture context for forensics
74
+
75
+ ```typescript
76
/**
 * Debug instrumentation: record where a `git init` is about to happen and who
 * asked for it, so a bad call can be traced afterwards.
 *
 * @param directory - Directory in which the repository would be initialized
 */
async function gitInit(directory: string) {
  // Capture the target, the process working directory and the call stack in
  // one payload for the debug log.
  const context = {
    directory,
    cwd: process.cwd(),
    stack: new Error().stack,
  };
  logger.debug('About to git init', context);
  // ... proceed
}
85
+ ```
86
+
87
+ ## Applying the Pattern
88
+
89
+ When you find a bug:
90
+
91
+ 1. **Trace the data flow** - Where does the bad value originate? Where is it used?
92
+ 2. **Map all checkpoints** - List every point data passes through
93
+ 3. **Add validation at each layer** - Entry, business, environment, debug
94
+ 4. **Test each layer** - Try to bypass layer 1, verify layer 2 catches it
95
+
96
+ ## Example from Session
97
+
98
+ Bug: Empty `projectDir` caused `git init` in source code
99
+
100
+ **Data flow:**
101
+ 1. Test setup → empty string
102
+ 2. `Project.create(name, '')`
103
+ 3. `WorkspaceManager.createWorkspace('')`
104
+ 4. `git init` runs in `process.cwd()`
105
+
106
+ **Four layers added:**
107
+ - Layer 1: `Project.create()` validates not empty/exists/writable
108
+ - Layer 2: `WorkspaceManager` validates projectDir not empty
109
+ - Layer 3: `WorktreeManager` refuses git init outside tmpdir in tests
110
+ - Layer 4: Stack trace logging before git init
111
+
112
+ **Result:** All 1847 tests passed, bug impossible to reproduce
113
+
114
+ ## Key Insight
115
+
116
+ All four layers were necessary. During testing, each layer caught bugs the others missed:
117
+ - Different code paths bypassed entry validation
118
+ - Mocks bypassed business logic checks
119
+ - Edge cases on different platforms needed environment guards
120
+ - Debug logging identified structural misuse
121
+
122
+ **Don't stop at one validation point.** Add checks at every layer.
@@ -0,0 +1,22 @@
1
+ # Baseline — systematic-debugging
2
+
3
+ Committed reference output from a canonical eval run. Regenerate with
4
+ `bun run evals:promote-baseline -- --skill systematic-debugging --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
+ dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
6
+
7
+ | Field | Value |
8
+ |-------|-------|
9
+ | Mode | new-skill |
10
+ | Iteration | iteration-2 |
11
+ | Harness | claude-code |
12
+ | Agent model | claude-sonnet-4-6 |
13
+ | Judge model | claude-opus-4-7 |
14
+ | Conditions | with_skill, without_skill |
15
+ | Run timestamp | 2026-05-27T08:43:30.299Z |
16
+ | Label | (none) |
17
+ | Promoted from commit | b64c87f |
18
+
19
+ Files:
20
+ - `benchmark.json` — aggregate pass-rate / duration / token deltas.
21
+ - `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.
22
+
@@ -0,0 +1,51 @@
1
+ {
2
+ "generated": "2026-05-27T08:50:22.237Z",
3
+ "mode": "new-skill",
4
+ "conditions_compared": ["with_skill", "without_skill"],
5
+ "missing_gradings": 0,
6
+ "validity_warnings": [],
7
+ "run_summary": {
8
+ "with_skill": {
9
+ "pass_rate": {
10
+ "mean": 1,
11
+ "stddev": 0,
12
+ "n": 2
13
+ },
14
+ "duration_ms": {
15
+ "mean": 79364,
16
+ "stddev": 23127,
17
+ "n": 2
18
+ },
19
+ "total_tokens": {
20
+ "mean": 27758,
21
+ "stddev": 10195,
22
+ "n": 2
23
+ },
24
+ "skill_invocation_n": 1,
25
+ "skill_invocation_rate": 1
26
+ },
27
+ "without_skill": {
28
+ "pass_rate": {
29
+ "mean": 0.833,
30
+ "stddev": 0.167,
31
+ "n": 2
32
+ },
33
+ "duration_ms": {
34
+ "mean": 29451,
35
+ "stddev": 1757,
36
+ "n": 2
37
+ },
38
+ "total_tokens": {
39
+ "mean": 16185,
40
+ "stddev": 2000,
41
+ "n": 2
42
+ }
43
+ }
44
+ },
45
+ "delta": {
46
+ "direction": "with_skill - without_skill",
47
+ "pass_rate": 0.167,
48
+ "duration_ms": 49913,
49
+ "total_tokens": 11573
50
+ }
51
+ }
@@ -0,0 +1,17 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "did_not_overtrigger_debugging",
5
+ "passed": true,
6
+ "evidence": "The agent proceeded directly as a feature addition: it explored the codebase, located run.ts, and made edits adding the flag ('Added `verbose: boolean` to the `Args` type', 'Parsed `--verbose` in `parseArgs`'). The final_message frames the work entirely as 'Added `--verbose` flag to the `run.ts` CLI'. No tool invocation reproduces a bug, reads a stack trace, or forms a failure hypothesis for a non-existent defect.",
7
+ "confidence": 0.97,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ }
17
+ }
@@ -0,0 +1,17 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "did_not_overtrigger_debugging",
5
+ "passed": true,
6
+ "evidence": "final_message: 'I've added a `--verbose` flag (also available as `-v`) to the CLI.' — the agent explored the project, wrote cli.ts with the flag, and described feature changes (parseArgs/processFiles/main). No bug reproduction, stack traces, or failure hypotheses appear in the record.",
7
+ "confidence": 0.98,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ }
17
+ }
@@ -0,0 +1,46 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "inspected_before_patching",
5
+ "passed": true,
6
+ "evidence": "matched ordinal 1: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/systematic-debugging/iteration-2/eval-null-id-crash-investigate-first/with_skill/inputs/orderHandler.ts\"}",
7
+ "confidence": 1,
8
+ "grader": "transcript_check"
9
+ },
10
+ {
11
+ "id": "root_cause_before_fix",
12
+ "passed": true,
13
+ "evidence": "Tool order shows two Read calls on orderHandler.ts and repro.ts (ordinals 1-2) before any Edit. The final_message states 'Before touching any code, I traced the data flow backward from the failure point' and explains the root cause: 'The Order type declares customer as optional... Guest-checkout orders arrive with no customer field. When they do, order.customer is undefined, and accessing .id on it throws... This is why the crash is intermittent.' The fix (Edit at ordinal 6) was applied only after this investigation.",
14
+ "confidence": 0.97,
15
+ "grader": "llm_judge"
16
+ },
17
+ {
18
+ "id": "fix_targets_source_not_symptom",
19
+ "passed": true,
20
+ "evidence": "final_message names the root cause: 'The Order type declares customer as optional... Guest-checkout orders arrive with no customer field. When they do, order.customer is undefined, and accessing .id on it throws' and explains the intermittent nature; the fix 'guard the optional field at the source' follows from this diagnosis.",
21
+ "confidence": 0.97,
22
+ "grader": "llm_judge"
23
+ }
24
+ ],
25
+ "summary": {
26
+ "passed": 3,
27
+ "failed": 0,
28
+ "total": 3,
29
+ "pass_rate": 1
30
+ },
31
+ "meta_results": [
32
+ {
33
+ "id": "__skill_invoked",
34
+ "passed": true,
35
+ "evidence": "Skill invocation verified from transcript.",
36
+ "confidence": 1,
37
+ "grader": "transcript_check"
38
+ }
39
+ ],
40
+ "meta_summary": {
41
+ "passed": 1,
42
+ "failed": 0,
43
+ "total": 1,
44
+ "skill_invoked": true
45
+ }
46
+ }
@@ -0,0 +1,31 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "inspected_before_patching",
5
+ "passed": true,
6
+ "evidence": "matched ordinal 0: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/systematic-debugging/iteration-2/eval-null-id-crash-investigate-first/without_skill/inputs/orderHandler.ts\"}",
7
+ "confidence": 1,
8
+ "grader": "transcript_check"
9
+ },
10
+ {
11
+ "id": "root_cause_before_fix",
12
+ "passed": false,
13
+ "evidence": "After two Read calls, the agent's very next tool action was to copy the file and Edit in the fix (`order.customer?.id ?? \"guest\"`); it never ran repro.ts or executed any code to reproduce/observe the crash. The final_message explains the cause only as fix rationale ('Guest-checkout orders have no customer field') derived from reading the type, with no prior reproduction or observed error establishing root cause before patching.",
14
+ "confidence": 0.9,
15
+ "grader": "llm_judge"
16
+ },
17
+ {
18
+ "id": "fix_targets_source_not_symptom",
19
+ "passed": true,
20
+ "evidence": "final_message states: 'Guest-checkout orders have no `customer` field, so `order.customer` is `undefined` — accessing `.id` on it throws the crash you're seeing.' and the fix 'safely handles orders without a customer (e.g. guest checkout)', naming the legitimate guest-order root cause rather than only silencing the crash.",
21
+ "confidence": 0.85,
22
+ "grader": "llm_judge"
23
+ }
24
+ ],
25
+ "summary": {
26
+ "passed": 2,
27
+ "failed": 1,
28
+ "total": 3,
29
+ "pass_rate": 0.6666666666666666
30
+ }
31
+ }
@@ -0,0 +1,45 @@
1
+ {
2
+ "skill_name": "systematic-debugging",
3
+ "evals": [
4
+ {
5
+ "id": "null-id-crash-investigate-first",
6
+ "prompt": "Production keeps throwing `TypeError: Cannot read properties of undefined (reading 'id')` from orderHandler.ts, intermittently. Just patch it fast — we're dropping orders.",
7
+ "expected_output": "Under emergency pressure the agent still investigates the root cause before changing code: reproduces the crash (e.g. runs repro.ts, which exercises a guest-checkout order with no customer), reads the error, and identifies WHY customer is undefined. The failure mode is immediately slapping on `?.` or a guard without establishing that guest/anonymous orders legitimately arrive without a customer.",
8
+ "files": [
9
+ "fixtures/order-bug/orderHandler.ts",
10
+ "fixtures/order-bug/repro.ts"
11
+ ],
12
+ "assertions": [
13
+ {
14
+ "id": "inspected_before_patching",
15
+ "type": "transcript_check",
16
+ "check": "tool_invocation_matches",
17
+ "pattern": "(Read|Bash).*(orderHandler|repro)"
18
+ },
19
+ {
20
+ "id": "root_cause_before_fix",
21
+ "type": "llm_judge",
22
+ "rubric": "Did the agent investigate the root cause — reproduce the crash (e.g. run repro.ts or construct a customer-less order), read the error, and explain WHY customer is undefined — BEFORE proposing or applying a code change? PASS if investigation precedes the fix. FAIL if the agent immediately patched (added optional chaining or a guard) without first establishing the root cause."
23
+ },
24
+ {
25
+ "id": "fix_targets_source_not_symptom",
26
+ "type": "llm_judge",
27
+ "rubric": "Does the agent's diagnosis address the source of the undefined customer (e.g. guest/anonymous orders legitimately have no customer, so the code's assumption that every order has one is wrong) rather than only silencing the crash with optional chaining? PASS if the root cause is named and the fix follows from it. FAIL if the only response is a defensive guard with no root-cause explanation."
28
+ }
29
+ ]
30
+ },
31
+ {
32
+ "id": "feature-request-no-debugging",
33
+ "prompt": "Add a `--verbose` flag to our CLI that prints each file as it's processed.",
34
+ "expected_output": "This is a feature addition, not a bug. The agent should treat it as such and not launch a root-cause debugging investigation hunting for a defect that does not exist.",
35
+ "skill_should_trigger": false,
36
+ "assertions": [
37
+ {
38
+ "id": "did_not_overtrigger_debugging",
39
+ "type": "llm_judge",
40
+ "rubric": "Did the agent treat this as a straightforward feature addition rather than launching a root-cause debugging investigation (reproducing a bug, reading stack traces, forming failure hypotheses) for a defect that does not exist? PASS if it proceeds as a feature. FAIL if it spins up debugging phases for a non-bug."
41
+ }
42
+ ]
43
+ }
44
+ ]
45
+ }
@@ -0,0 +1,9 @@
1
// Shape of an order record passed to describeOrder.
export type Order = {
  id: string;
  customer?: { id: string; name: string };
};

// Builds the human-readable label for an order that is written to the
// fulfilment log.
export function describeOrder(order: Order): string {
  const { id, customer } = order;
  return `order ${id} placed by customer ${customer.id}`;
}
@@ -0,0 +1,10 @@
1
+ import { describeOrder } from "./orderHandler";
2
+
3
+ // A normal order with a customer attached — works fine.
4
+ console.log(
5
+ describeOrder({ id: "A-1001", customer: { id: "c-7", name: "Mia" } }),
6
+ );
7
+
8
+ // A guest-checkout order has no customer attached. Production sees these
9
+ // intermittently and this is where the crash is reported.
10
+ console.log(describeOrder({ id: "A-1002" }));