@slowdini/slow-powers-opencode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +174 -0
- package/bootstrap.md +16 -0
- package/opencode/plugins/slow-powers.js +86 -0
- package/package.json +66 -0
- package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
- package/skills/evaluating-skills/SKILL.md +448 -0
- package/skills/evaluating-skills/evals/evals.json +52 -0
- package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
- package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
- package/skills/evaluating-skills/harness-details/claude.md +135 -0
- package/skills/evaluating-skills/pressure-scenarios.md +163 -0
- package/skills/evaluating-skills/runner/README.md +140 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
- package/skills/evaluating-skills/runner/aggregate.ts +228 -0
- package/skills/evaluating-skills/runner/context.test.ts +181 -0
- package/skills/evaluating-skills/runner/context.ts +90 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
- package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
- package/skills/evaluating-skills/runner/grade.test.ts +347 -0
- package/skills/evaluating-skills/runner/grade.ts +603 -0
- package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
- package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
- package/skills/evaluating-skills/runner/guard/install.ts +147 -0
- package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
- package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
- package/skills/evaluating-skills/runner/run.test.ts +716 -0
- package/skills/evaluating-skills/runner/run.ts +814 -0
- package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
- package/skills/evaluating-skills/runner/types.ts +104 -0
- package/skills/evaluating-skills/runner/validate-all.ts +54 -0
- package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
- package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
- package/skills/evaluating-skills/runner/validate.test.ts +56 -0
- package/skills/evaluating-skills/runner/validate.ts +21 -0
- package/skills/evaluating-skills/schema/evals.schema.json +105 -0
- package/skills/evaluating-skills/schema/grading.schema.json +84 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
- package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
- package/skills/evaluating-skills/templates/evals.json.example +17 -0
- package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
- package/skills/finishing-a-development-branch/SKILL.md +96 -0
- package/skills/finishing-a-development-branch/evals/evals.json +41 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
- package/skills/hardening-plans/SKILL.md +72 -0
- package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
- package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
- package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
- package/skills/hardening-plans/evals/evals.json +114 -0
- package/skills/systematic-debugging/CREATION-LOG.md +119 -0
- package/skills/systematic-debugging/SKILL.md +84 -0
- package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
- package/skills/systematic-debugging/condition-based-waiting.md +115 -0
- package/skills/systematic-debugging/defense-in-depth.md +122 -0
- package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
- package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
- package/skills/systematic-debugging/evals/evals.json +45 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
- package/skills/systematic-debugging/find-polluter.sh +63 -0
- package/skills/systematic-debugging/root-cause-tracing.md +169 -0
- package/skills/systematic-debugging/test-academic.md +14 -0
- package/skills/systematic-debugging/test-pressure-1.md +58 -0
- package/skills/systematic-debugging/test-pressure-2.md +68 -0
- package/skills/systematic-debugging/test-pressure-3.md +69 -0
- package/skills/test-driven-development/SKILL.md +93 -0
- package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
- package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
- package/skills/test-driven-development/evals/evals.json +77 -0
- package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
- package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
- package/skills/test-driven-development/testing-anti-patterns.md +299 -0
- package/skills/using-git-worktrees/SKILL.md +70 -0
- package/skills/using-git-worktrees/evals/evals.json +40 -0
- package/skills/verification-before-completion/SKILL.md +65 -0
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
- package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
- package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/evals.json +77 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
- package/skills/writing-skills/SKILL.md +306 -0
- package/skills/writing-skills/evals/evals.json +40 -0
- package/skills/writing-skills/graphviz-conventions.dot +172 -0
- package/skills/writing-skills/persuasion-principles.md +187 -0
- package/skills/writing-skills/scripts/render-graphs.js +181 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
// Complete implementation of condition-based waiting utilities
|
|
2
|
+
// From: Lace test infrastructure improvements (2025-10-03)
|
|
3
|
+
// Context: Fixed 15 flaky tests by replacing arbitrary timeouts
|
|
4
|
+
|
|
5
|
+
import type { ThreadManager } from "~/threads/thread-manager";
|
|
6
|
+
import type { LaceEvent, LaceEventType } from "~/threads/types";
|
|
7
|
+
|
|
8
|
+
/**
 * Wait for a specific event type to appear in a thread.
 *
 * Checks `threadManager.getEvents(threadId)` immediately and then every 10ms
 * until an event whose `type` equals `eventType` is present, or until more
 * than `timeoutMs` milliseconds have elapsed since the call.
 *
 * @param threadManager - The thread manager to query
 * @param threadId - Thread to check for events
 * @param eventType - Type of event to wait for
 * @param timeoutMs - Maximum time to wait (default 5000ms)
 * @returns Promise resolving to the first matching event
 * @throws Rejects with a timeout `Error` naming the event type when no match
 *   appears in time; rejects with the original error if `getEvents` throws
 *   on any poll.
 *
 * Example:
 *   await waitForEvent(threadManager, agentThreadId, 'TOOL_RESULT');
 */
export async function waitForEvent(
  threadManager: ThreadManager,
  threadId: string,
  eventType: LaceEventType,
  timeoutMs = 5000,
): Promise<LaceEvent> {
  const startTime = Date.now();

  // An async loop (instead of a timer callback inside `new Promise`) means an
  // exception thrown on ANY poll rejects the returned promise. With a timer
  // callback, a throw on a later poll would escape as an uncaught error and
  // leave the promise pending forever.
  while (true) {
    // Re-query on every iteration so the check never runs against stale data.
    const event = threadManager
      .getEvents(threadId)
      .find((e) => e.type === eventType);
    if (event) return event;

    if (Date.now() - startTime > timeoutMs) {
      throw new Error(
        `Timeout waiting for ${eventType} event after ${timeoutMs}ms`,
      );
    }

    await new Promise<void>((r) => setTimeout(r, 10)); // Poll every 10ms
  }
}
|
|
49
|
+
|
|
50
|
+
/**
 * Wait for a specific number of events of a given type.
 *
 * Checks `threadManager.getEvents(threadId)` immediately and then every 10ms
 * until at least `count` events with `type === eventType` are present, or
 * until more than `timeoutMs` milliseconds have elapsed since the call.
 *
 * @param threadManager - The thread manager to query
 * @param threadId - Thread to check for events
 * @param eventType - Type of event to wait for
 * @param count - Number of events to wait for
 * @param timeoutMs - Maximum time to wait (default 5000ms)
 * @returns Promise resolving to all matching events once count is reached
 *   (every event of that type present at that moment, which may exceed `count`)
 * @throws Rejects with a timeout `Error` reporting how many events were seen;
 *   rejects with the original error if `getEvents` throws on any poll.
 *
 * Example:
 *   // Wait for 2 AGENT_MESSAGE events (initial response + continuation)
 *   await waitForEventCount(threadManager, agentThreadId, 'AGENT_MESSAGE', 2);
 */
export async function waitForEventCount(
  threadManager: ThreadManager,
  threadId: string,
  eventType: LaceEventType,
  count: number,
  timeoutMs = 5000,
): Promise<LaceEvent[]> {
  const startTime = Date.now();

  // An async loop (instead of a timer callback inside `new Promise`) means an
  // exception thrown on ANY poll rejects the returned promise rather than
  // escaping as an uncaught error and leaving the promise pending forever.
  while (true) {
    // Re-query on every iteration so the count reflects fresh data.
    const matchingEvents = threadManager
      .getEvents(threadId)
      .filter((e) => e.type === eventType);
    if (matchingEvents.length >= count) return matchingEvents;

    if (Date.now() - startTime > timeoutMs) {
      throw new Error(
        `Timeout waiting for ${count} ${eventType} events after ${timeoutMs}ms (got ${matchingEvents.length})`,
      );
    }

    await new Promise<void>((r) => setTimeout(r, 10)); // Poll every 10ms
  }
}
|
|
94
|
+
|
|
95
|
+
/**
 * Wait for an event matching a custom predicate.
 * Useful when you need to check event data, not just type.
 *
 * Checks `threadManager.getEvents(threadId)` immediately and then every 10ms
 * until `predicate` returns true for some event, or until more than
 * `timeoutMs` milliseconds have elapsed since the call.
 *
 * @param threadManager - The thread manager to query
 * @param threadId - Thread to check for events
 * @param predicate - Function that returns true when event matches
 * @param description - Human-readable description for error messages
 * @param timeoutMs - Maximum time to wait (default 5000ms)
 * @returns Promise resolving to the first matching event
 * @throws Rejects with a timeout `Error` containing `description` when no
 *   match appears in time; rejects with the original error if `getEvents` or
 *   `predicate` throws on any poll.
 *
 * Example:
 *   // Wait for TOOL_RESULT with specific ID
 *   await waitForEventMatch(
 *     threadManager,
 *     agentThreadId,
 *     (e) => e.type === 'TOOL_RESULT' && e.data.id === 'call_123',
 *     'TOOL_RESULT with id=call_123'
 *   );
 */
export async function waitForEventMatch(
  threadManager: ThreadManager,
  threadId: string,
  predicate: (event: LaceEvent) => boolean,
  description: string,
  timeoutMs = 5000,
): Promise<LaceEvent> {
  const startTime = Date.now();

  // An async loop (instead of a timer callback inside `new Promise`) means an
  // exception thrown on ANY poll - including one raised by the caller's
  // predicate - rejects the returned promise rather than escaping as an
  // uncaught error and leaving the promise pending forever.
  while (true) {
    // Re-query on every iteration so the predicate sees fresh data.
    const event = threadManager.getEvents(threadId).find(predicate);
    if (event) return event;

    if (Date.now() - startTime > timeoutMs) {
      throw new Error(`Timeout waiting for ${description} after ${timeoutMs}ms`);
    }

    await new Promise<void>((r) => setTimeout(r, 10)); // Poll every 10ms
  }
}
|
|
143
|
+
|
|
144
|
+
// Usage example from actual debugging session:
|
|
145
|
+
//
|
|
146
|
+
// BEFORE (flaky):
|
|
147
|
+
// ---------------
|
|
148
|
+
// const messagePromise = agent.sendMessage('Execute tools');
|
|
149
|
+
// await new Promise(r => setTimeout(r, 300)); // Hope tools start in 300ms
|
|
150
|
+
// agent.abort();
|
|
151
|
+
// await messagePromise;
|
|
152
|
+
// await new Promise(r => setTimeout(r, 50)); // Hope results arrive in 50ms
|
|
153
|
+
// expect(toolResults.length).toBe(2); // Fails randomly
|
|
154
|
+
//
|
|
155
|
+
// AFTER (reliable):
|
|
156
|
+
// ----------------
|
|
157
|
+
// const messagePromise = agent.sendMessage('Execute tools');
|
|
158
|
+
// await waitForEventCount(threadManager, threadId, 'TOOL_CALL', 2); // Wait for tools to start
|
|
159
|
+
// agent.abort();
|
|
160
|
+
// await messagePromise;
|
|
161
|
+
// await waitForEventCount(threadManager, threadId, 'TOOL_RESULT', 2); // Wait for results
|
|
162
|
+
// expect(toolResults.length).toBe(2); // Always succeeds
|
|
163
|
+
//
|
|
164
|
+
// Result: 60% pass rate → 100%, 40% faster execution
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Condition-Based Waiting
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Flaky tests often guess at timing with arbitrary delays. This creates race conditions where tests pass on fast machines but fail under load or in CI.
|
|
6
|
+
|
|
7
|
+
**Core principle:** Wait for the actual condition you care about, not a guess about how long it takes.
|
|
8
|
+
|
|
9
|
+
## When to Use
|
|
10
|
+
|
|
11
|
+
```dot
|
|
12
|
+
digraph when_to_use {
|
|
13
|
+
"Test uses setTimeout/sleep?" [shape=diamond];
|
|
14
|
+
"Testing timing behavior?" [shape=diamond];
|
|
15
|
+
"Document WHY timeout needed" [shape=box];
|
|
16
|
+
"Use condition-based waiting" [shape=box];
|
|
17
|
+
|
|
18
|
+
"Test uses setTimeout/sleep?" -> "Testing timing behavior?" [label="yes"];
|
|
19
|
+
"Testing timing behavior?" -> "Document WHY timeout needed" [label="yes"];
|
|
20
|
+
"Testing timing behavior?" -> "Use condition-based waiting" [label="no"];
|
|
21
|
+
}
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
**Use when:**
|
|
25
|
+
- Tests have arbitrary delays (`setTimeout`, `sleep`, `time.sleep()`)
|
|
26
|
+
- Tests are flaky (pass sometimes, fail under load)
|
|
27
|
+
- Tests time out when run in parallel
|
|
28
|
+
- Waiting for async operations to complete
|
|
29
|
+
|
|
30
|
+
**Don't use when:**
|
|
31
|
+
- Testing actual timing behavior (debounce, throttle intervals)
|
|
32
|
+
- (If you do keep an arbitrary timeout, always document WHY it is needed)
|
|
33
|
+
|
|
34
|
+
## Core Pattern
|
|
35
|
+
|
|
36
|
+
```typescript
|
|
37
|
+
// ❌ BEFORE: Guessing at timing
|
|
38
|
+
await new Promise(r => setTimeout(r, 50));
|
|
39
|
+
const result = getResult();
|
|
40
|
+
expect(result).toBeDefined();
|
|
41
|
+
|
|
42
|
+
// ✅ AFTER: Waiting for condition
|
|
43
|
+
await waitFor(() => getResult() !== undefined);
|
|
44
|
+
const result = getResult();
|
|
45
|
+
expect(result).toBeDefined();
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Quick Patterns
|
|
49
|
+
|
|
50
|
+
| Scenario | Pattern |
|
|
51
|
+
|----------|---------|
|
|
52
|
+
| Wait for event | `waitFor(() => events.find(e => e.type === 'DONE'))` |
|
|
53
|
+
| Wait for state | `waitFor(() => machine.state === 'ready')` |
|
|
54
|
+
| Wait for count | `waitFor(() => items.length >= 5)` |
|
|
55
|
+
| Wait for file | `waitFor(() => fs.existsSync(path))` |
|
|
56
|
+
| Complex condition | `waitFor(() => obj.ready && obj.value > 10)` |
|
|
57
|
+
|
|
58
|
+
## Implementation
|
|
59
|
+
|
|
60
|
+
Generic polling function:
|
|
61
|
+
```typescript
|
|
62
|
+
/**
 * Poll `condition` until it returns a truthy value, then resolve with it.
 *
 * Note: the check is truthiness-based, so falsy results such as `0` or `''`
 * never satisfy the condition.
 *
 * @param condition - Called on every poll; return a truthy value when done
 * @param description - Used in the timeout error message (default 'condition'),
 *   so one-argument calls like `waitFor(() => machine.state === 'ready')` work
 * @param timeoutMs - Maximum time to wait (default 5000ms)
 * @returns The first truthy value returned by `condition`
 * @throws Error when the condition is still falsy after `timeoutMs`
 */
async function waitFor<T>(
  condition: () => T | undefined | null | false,
  description = 'condition',
  timeoutMs = 5000
): Promise<T> {
  const startTime = Date.now();

  while (true) {
    // Call the getter inside the loop so every poll sees fresh data
    const result = condition();
    if (result) return result;

    if (Date.now() - startTime > timeoutMs) {
      throw new Error(`Timeout waiting for ${description} after ${timeoutMs}ms`);
    }

    await new Promise(r => setTimeout(r, 10)); // Poll every 10ms
  }
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
See `condition-based-waiting-example.ts` in this directory for a complete implementation with domain-specific helpers (`waitForEvent`, `waitForEventCount`, `waitForEventMatch`) from an actual debugging session.
|
|
83
|
+
|
|
84
|
+
## Common Mistakes
|
|
85
|
+
|
|
86
|
+
**❌ Polling too fast:** `setTimeout(check, 1)` - wastes CPU
|
|
87
|
+
**✅ Fix:** Poll every 10ms
|
|
88
|
+
|
|
89
|
+
**❌ No timeout:** Loop forever if condition never met
|
|
90
|
+
**✅ Fix:** Always include timeout with clear error
|
|
91
|
+
|
|
92
|
+
**❌ Stale data:** Caching state before the loop, so the condition never sees updates
|
|
93
|
+
**✅ Fix:** Call getter inside loop for fresh data
|
|
94
|
+
|
|
95
|
+
## When Arbitrary Timeout IS Correct
|
|
96
|
+
|
|
97
|
+
```typescript
|
|
98
|
+
// Tool ticks every 100ms - need 2 ticks to verify partial output
// (waitForEvent takes the thread manager, the thread id, then the event type)
await waitForEvent(manager, threadId, 'TOOL_STARTED'); // First: wait for condition
await new Promise(r => setTimeout(r, 200));   // Then: wait for timed behavior
// 200ms = 2 ticks at 100ms intervals - documented and justified
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Requirements:**
|
|
105
|
+
1. First wait for triggering condition
|
|
106
|
+
2. Based on known timing (not guessing)
|
|
107
|
+
3. Comment explaining WHY
|
|
108
|
+
|
|
109
|
+
## Real-World Impact
|
|
110
|
+
|
|
111
|
+
From debugging session (2025-10-03):
|
|
112
|
+
- Fixed 15 flaky tests across 3 files
|
|
113
|
+
- Pass rate: 60% → 100%
|
|
114
|
+
- Execution time: 40% faster
|
|
115
|
+
- No more race conditions
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# Defense-in-Depth Validation
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
When you fix a bug caused by invalid data, adding validation in one place feels sufficient. But that single check can be bypassed by different code paths, refactoring, or mocks.
|
|
6
|
+
|
|
7
|
+
**Core principle:** Validate at EVERY layer data passes through. Make the bug structurally impossible.
|
|
8
|
+
|
|
9
|
+
## Why Multiple Layers
|
|
10
|
+
|
|
11
|
+
Single validation: "We fixed the bug"
|
|
12
|
+
Multiple layers: "We made the bug impossible"
|
|
13
|
+
|
|
14
|
+
Different layers catch different cases:
|
|
15
|
+
- Entry validation catches most bugs
|
|
16
|
+
- Business logic catches edge cases
|
|
17
|
+
- Environment guards prevent context-specific dangers
|
|
18
|
+
- Debug logging helps when other layers fail
|
|
19
|
+
|
|
20
|
+
## The Four Layers
|
|
21
|
+
|
|
22
|
+
### Layer 1: Entry Point Validation
|
|
23
|
+
**Purpose:** Reject obviously invalid input at API boundary
|
|
24
|
+
|
|
25
|
+
```typescript
|
|
26
|
+
// Layer 1 guard: reject a working directory that is blank, missing on disk,
// or not a directory before any project work starts.
function createProject(name: string, workingDirectory: string) {
  const isBlank = !workingDirectory || workingDirectory.trim() === '';
  if (isBlank) {
    throw new Error('workingDirectory cannot be empty');
  }

  const isPresent = existsSync(workingDirectory);
  if (!isPresent) {
    throw new Error(`workingDirectory does not exist: ${workingDirectory}`);
  }

  const info = statSync(workingDirectory);
  if (!info.isDirectory()) {
    throw new Error(`workingDirectory is not a directory: ${workingDirectory}`);
  }
  // ... proceed
}
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Layer 2: Business Logic Validation
|
|
41
|
+
**Purpose:** Ensure data makes sense for this operation
|
|
42
|
+
|
|
43
|
+
```typescript
|
|
44
|
+
// Layer 2 guard: workspace initialization is meaningless without a project
// directory, so fail fast on any falsy value.
function initializeWorkspace(projectDir: string, sessionId: string) {
  const hasProjectDir = Boolean(projectDir);
  if (hasProjectDir === false) {
    throw new Error('projectDir required for workspace initialization');
  }
  // ... proceed
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Layer 3: Environment Guards
|
|
53
|
+
**Purpose:** Prevent dangerous operations in specific contexts
|
|
54
|
+
|
|
55
|
+
```typescript
|
|
56
|
+
// Layer 3 guard: while NODE_ENV is 'test', only allow git init in the OS temp
// directory itself or somewhere beneath it.
async function gitInit(directory: string) {
  // In tests, refuse git init outside temp directories
  if (process.env.NODE_ENV === 'test') {
    const normalized = normalize(resolve(directory));
    const tmpDir = normalize(resolve(tmpdir()));
    // Compare against tmpDir WITH a trailing separator. A bare
    // startsWith(tmpDir) is a string-prefix test and would also accept
    // siblings such as "/tmp-evil" when tmpDir is "/tmp". normalize() keeps
    // the trailing separator and converts it to the platform's form.
    const tmpDirPrefix = normalize(tmpDir + '/');

    const insideTmp =
      normalized === tmpDir || normalized.startsWith(tmpDirPrefix);
    if (!insideTmp) {
      throw new Error(
        `Refusing git init outside temp dir during tests: ${directory}`
      );
    }
  }
  // ... proceed
}
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Layer 4: Debug Instrumentation
|
|
73
|
+
**Purpose:** Capture context for forensics
|
|
74
|
+
|
|
75
|
+
```typescript
|
|
76
|
+
// Layer 4: record who asked for the git init and from where, so a bad call
// can be traced after the fact.
async function gitInit(directory: string) {
  // Capture the call stack up front; it identifies the caller of gitInit.
  const callSite = new Error().stack;
  const context = {
    directory,
    cwd: process.cwd(),
    stack: callSite,
  };
  logger.debug('About to git init', context);
  // ... proceed
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Applying the Pattern
|
|
88
|
+
|
|
89
|
+
When you find a bug:
|
|
90
|
+
|
|
91
|
+
1. **Trace the data flow** - Where does the bad value originate? Where is it used?
|
|
92
|
+
2. **Map all checkpoints** - List every point data passes through
|
|
93
|
+
3. **Add validation at each layer** - Entry, business, environment, debug
|
|
94
|
+
4. **Test each layer** - Try to bypass layer 1, verify layer 2 catches it
|
|
95
|
+
|
|
96
|
+
## Example from Session
|
|
97
|
+
|
|
98
|
+
Bug: An empty `projectDir` caused `git init` to run in the source code directory
|
|
99
|
+
|
|
100
|
+
**Data flow:**
|
|
101
|
+
1. Test setup → empty string
|
|
102
|
+
2. `Project.create(name, '')`
|
|
103
|
+
3. `WorkspaceManager.createWorkspace('')`
|
|
104
|
+
4. `git init` runs in `process.cwd()`
|
|
105
|
+
|
|
106
|
+
**Four layers added:**
|
|
107
|
+
- Layer 1: `Project.create()` validates not empty/exists/writable
|
|
108
|
+
- Layer 2: `WorkspaceManager` validates projectDir not empty
|
|
109
|
+
- Layer 3: `WorktreeManager` refuses git init outside tmpdir in tests
|
|
110
|
+
- Layer 4: Stack trace logging before git init
|
|
111
|
+
|
|
112
|
+
**Result:** All 1847 tests passed, bug impossible to reproduce
|
|
113
|
+
|
|
114
|
+
## Key Insight
|
|
115
|
+
|
|
116
|
+
All four layers were necessary. During testing, each layer caught bugs the others missed:
|
|
117
|
+
- Different code paths bypassed entry validation
|
|
118
|
+
- Mocks bypassed business logic checks
|
|
119
|
+
- Edge cases on different platforms needed environment guards
|
|
120
|
+
- Debug logging identified structural misuse
|
|
121
|
+
|
|
122
|
+
**Don't stop at one validation point.** Add checks at every layer.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Baseline — systematic-debugging
|
|
2
|
+
|
|
3
|
+
Committed reference output from a canonical eval run. Regenerate with
|
|
4
|
+
`bun run evals:promote-baseline -- --skill systematic-debugging --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
|
|
5
|
+
dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
|
|
6
|
+
|
|
7
|
+
| Field | Value |
|
|
8
|
+
|-------|-------|
|
|
9
|
+
| Mode | new-skill |
|
|
10
|
+
| Iteration | iteration-2 |
|
|
11
|
+
| Harness | claude-code |
|
|
12
|
+
| Agent model | claude-sonnet-4-6 |
|
|
13
|
+
| Judge model | claude-opus-4-7 |
|
|
14
|
+
| Conditions | with_skill, without_skill |
|
|
15
|
+
| Run timestamp | 2026-05-27T08:43:30.299Z |
|
|
16
|
+
| Label | (none) |
|
|
17
|
+
| Promoted from commit | b64c87f |
|
|
18
|
+
|
|
19
|
+
Files:
|
|
20
|
+
- `benchmark.json` — aggregate pass-rate / duration / token deltas.
|
|
21
|
+
- `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.
|
|
22
|
+
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
"generated": "2026-05-27T08:50:22.237Z",
|
|
3
|
+
"mode": "new-skill",
|
|
4
|
+
"conditions_compared": ["with_skill", "without_skill"],
|
|
5
|
+
"missing_gradings": 0,
|
|
6
|
+
"validity_warnings": [],
|
|
7
|
+
"run_summary": {
|
|
8
|
+
"with_skill": {
|
|
9
|
+
"pass_rate": {
|
|
10
|
+
"mean": 1,
|
|
11
|
+
"stddev": 0,
|
|
12
|
+
"n": 2
|
|
13
|
+
},
|
|
14
|
+
"duration_ms": {
|
|
15
|
+
"mean": 79364,
|
|
16
|
+
"stddev": 23127,
|
|
17
|
+
"n": 2
|
|
18
|
+
},
|
|
19
|
+
"total_tokens": {
|
|
20
|
+
"mean": 27758,
|
|
21
|
+
"stddev": 10195,
|
|
22
|
+
"n": 2
|
|
23
|
+
},
|
|
24
|
+
"skill_invocation_n": 1,
|
|
25
|
+
"skill_invocation_rate": 1
|
|
26
|
+
},
|
|
27
|
+
"without_skill": {
|
|
28
|
+
"pass_rate": {
|
|
29
|
+
"mean": 0.833,
|
|
30
|
+
"stddev": 0.167,
|
|
31
|
+
"n": 2
|
|
32
|
+
},
|
|
33
|
+
"duration_ms": {
|
|
34
|
+
"mean": 29451,
|
|
35
|
+
"stddev": 1757,
|
|
36
|
+
"n": 2
|
|
37
|
+
},
|
|
38
|
+
"total_tokens": {
|
|
39
|
+
"mean": 16185,
|
|
40
|
+
"stddev": 2000,
|
|
41
|
+
"n": 2
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
},
|
|
45
|
+
"delta": {
|
|
46
|
+
"direction": "with_skill - without_skill",
|
|
47
|
+
"pass_rate": 0.167,
|
|
48
|
+
"duration_ms": 49913,
|
|
49
|
+
"total_tokens": 11573
|
|
50
|
+
}
|
|
51
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "did_not_overtrigger_debugging",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "The agent proceeded directly as a feature addition: it explored the codebase, located run.ts, and made edits adding the flag ('Added `verbose: boolean` to the `Args` type', 'Parsed `--verbose` in `parseArgs`'). The final_message frames the work entirely as 'Added `--verbose` flag to the `run.ts` CLI'. No tool invocation reproduces a bug, reads a stack trace, or forms a failure hypothesis for a non-existent defect.",
|
|
7
|
+
"confidence": 0.97,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
}
|
|
10
|
+
],
|
|
11
|
+
"summary": {
|
|
12
|
+
"passed": 1,
|
|
13
|
+
"failed": 0,
|
|
14
|
+
"total": 1,
|
|
15
|
+
"pass_rate": 1
|
|
16
|
+
}
|
|
17
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "did_not_overtrigger_debugging",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "final_message: 'I've added a `--verbose` flag (also available as `-v`) to the CLI.' — the agent explored the project, wrote cli.ts with the flag, and described feature changes (parseArgs/processFiles/main). No bug reproduction, stack traces, or failure hypotheses appear in the record.",
|
|
7
|
+
"confidence": 0.98,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
}
|
|
10
|
+
],
|
|
11
|
+
"summary": {
|
|
12
|
+
"passed": 1,
|
|
13
|
+
"failed": 0,
|
|
14
|
+
"total": 1,
|
|
15
|
+
"pass_rate": 1
|
|
16
|
+
}
|
|
17
|
+
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "inspected_before_patching",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 1: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/systematic-debugging/iteration-2/eval-null-id-crash-investigate-first/with_skill/inputs/orderHandler.ts\"}",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "root_cause_before_fix",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "Tool order shows two Read calls on orderHandler.ts and repro.ts (ordinals 1-2) before any Edit. The final_message states 'Before touching any code, I traced the data flow backward from the failure point' and explains the root cause: 'The Order type declares customer as optional... Guest-checkout orders arrive with no customer field. When they do, order.customer is undefined, and accessing .id on it throws... This is why the crash is intermittent.' The fix (Edit at ordinal 6) was applied only after this investigation.",
|
|
14
|
+
"confidence": 0.97,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "fix_targets_source_not_symptom",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "final_message names the root cause: 'The Order type declares customer as optional... Guest-checkout orders arrive with no customer field. When they do, order.customer is undefined, and accessing .id on it throws' and explains the intermittent nature; the fix 'guard the optional field at the source' follows from this diagnosis.",
|
|
21
|
+
"confidence": 0.97,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"summary": {
|
|
26
|
+
"passed": 3,
|
|
27
|
+
"failed": 0,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"pass_rate": 1
|
|
30
|
+
},
|
|
31
|
+
"meta_results": [
|
|
32
|
+
{
|
|
33
|
+
"id": "__skill_invoked",
|
|
34
|
+
"passed": true,
|
|
35
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
36
|
+
"confidence": 1,
|
|
37
|
+
"grader": "transcript_check"
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
"meta_summary": {
|
|
41
|
+
"passed": 1,
|
|
42
|
+
"failed": 0,
|
|
43
|
+
"total": 1,
|
|
44
|
+
"skill_invoked": true
|
|
45
|
+
}
|
|
46
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "inspected_before_patching",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 0: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/systematic-debugging/iteration-2/eval-null-id-crash-investigate-first/without_skill/inputs/orderHandler.ts\"}",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "root_cause_before_fix",
|
|
12
|
+
"passed": false,
|
|
13
|
+
"evidence": "After two Read calls, the agent's very next tool action was to copy the file and Edit in the fix (`order.customer?.id ?? \"guest\"`); it never ran repro.ts or executed any code to reproduce/observe the crash. The final_message explains the cause only as fix rationale ('Guest-checkout orders have no customer field') derived from reading the type, with no prior reproduction or observed error establishing root cause before patching.",
|
|
14
|
+
"confidence": 0.9,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "fix_targets_source_not_symptom",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "final_message states: 'Guest-checkout orders have no `customer` field, so `order.customer` is `undefined` — accessing `.id` on it throws the crash you're seeing.' and the fix 'safely handles orders without a customer (e.g. guest checkout)', naming the legitimate guest-order root cause rather than only silencing the crash.",
|
|
21
|
+
"confidence": 0.85,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"summary": {
|
|
26
|
+
"passed": 2,
|
|
27
|
+
"failed": 1,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"pass_rate": 0.6666666666666666
|
|
30
|
+
}
|
|
31
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "systematic-debugging",
|
|
3
|
+
"evals": [
|
|
4
|
+
{
|
|
5
|
+
"id": "null-id-crash-investigate-first",
|
|
6
|
+
"prompt": "Production keeps throwing `TypeError: Cannot read properties of undefined (reading 'id')` from orderHandler.ts, intermittently. Just patch it fast — we're dropping orders.",
|
|
7
|
+
"expected_output": "Under emergency pressure the agent still investigates the root cause before changing code: reproduces the crash (e.g. runs repro.ts, which exercises a guest-checkout order with no customer), reads the error, and identifies WHY customer is undefined. The failure mode is immediately slapping on `?.` or a guard without establishing that guest/anonymous orders legitimately arrive without a customer.",
|
|
8
|
+
"files": [
|
|
9
|
+
"fixtures/order-bug/orderHandler.ts",
|
|
10
|
+
"fixtures/order-bug/repro.ts"
|
|
11
|
+
],
|
|
12
|
+
"assertions": [
|
|
13
|
+
{
|
|
14
|
+
"id": "inspected_before_patching",
|
|
15
|
+
"type": "transcript_check",
|
|
16
|
+
"check": "tool_invocation_matches",
|
|
17
|
+
"pattern": "(Read|Bash).*(orderHandler|repro)"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "root_cause_before_fix",
|
|
21
|
+
"type": "llm_judge",
|
|
22
|
+
"rubric": "Did the agent investigate the root cause — reproduce the crash (e.g. run repro.ts or construct a customer-less order), read the error, and explain WHY customer is undefined — BEFORE proposing or applying a code change? PASS if investigation precedes the fix. FAIL if the agent immediately patched (added optional chaining or a guard) without first establishing the root cause."
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "fix_targets_source_not_symptom",
|
|
26
|
+
"type": "llm_judge",
|
|
27
|
+
"rubric": "Does the agent's diagnosis address the source of the undefined customer (e.g. guest/anonymous orders legitimately have no customer, so the code's assumption that every order has one is wrong) rather than only silencing the crash with optional chaining? PASS if the root cause is named and the fix follows from it. FAIL if the only response is a defensive guard with no root-cause explanation."
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"id": "feature-request-no-debugging",
|
|
33
|
+
"prompt": "Add a `--verbose` flag to our CLI that prints each file as it's processed.",
|
|
34
|
+
"expected_output": "This is a feature addition, not a bug. The agent should treat it as such and not launch a root-cause debugging investigation hunting for a defect that does not exist.",
|
|
35
|
+
"skill_should_trigger": false,
|
|
36
|
+
"assertions": [
|
|
37
|
+
{
|
|
38
|
+
"id": "did_not_overtrigger_debugging",
|
|
39
|
+
"type": "llm_judge",
|
|
40
|
+
"rubric": "Did the agent treat this as a straightforward feature addition rather than launching a root-cause debugging investigation (reproducing a bug, reading stack traces, forming failure hypotheses) for a defect that does not exist? PASS if it proceeds as a feature. FAIL if it spins up debugging phases for a non-bug."
|
|
41
|
+
}
|
|
42
|
+
]
|
|
43
|
+
}
|
|
44
|
+
]
|
|
45
|
+
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export type Order = {
|
|
2
|
+
id: string;
|
|
3
|
+
customer?: { id: string; name: string };
|
|
4
|
+
};
|
|
5
|
+
|
|
6
|
+
// Returns a human-readable label for an order, used in the fulfilment log.
|
|
7
|
+
export function describeOrder(order: Order): string {
|
|
8
|
+
return `order ${order.id} placed by customer ${order.customer.id}`;
|
|
9
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { describeOrder } from "./orderHandler";
|
|
2
|
+
|
|
3
|
+
// A normal order with a customer attached — works fine.
|
|
4
|
+
console.log(
|
|
5
|
+
describeOrder({ id: "A-1001", customer: { id: "c-7", name: "Mia" } }),
|
|
6
|
+
);
|
|
7
|
+
|
|
8
|
+
// A guest-checkout order has no customer attached. Production sees these
|
|
9
|
+
// intermittently and this is where the crash is reported.
|
|
10
|
+
console.log(describeOrder({ id: "A-1002" }));
|