@directive-run/knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +63 -0
- package/ai/ai-adapters.md +250 -0
- package/ai/ai-agents-streaming.md +269 -0
- package/ai/ai-budget-resilience.md +235 -0
- package/ai/ai-communication.md +281 -0
- package/ai/ai-debug-observability.md +243 -0
- package/ai/ai-guardrails-memory.md +332 -0
- package/ai/ai-mcp-rag.md +288 -0
- package/ai/ai-multi-agent.md +274 -0
- package/ai/ai-orchestrator.md +227 -0
- package/ai/ai-security.md +293 -0
- package/ai/ai-tasks.md +261 -0
- package/ai/ai-testing-evals.md +378 -0
- package/api-skeleton.md +5 -0
- package/core/anti-patterns.md +382 -0
- package/core/constraints.md +263 -0
- package/core/core-patterns.md +228 -0
- package/core/error-boundaries.md +322 -0
- package/core/multi-module.md +315 -0
- package/core/naming.md +283 -0
- package/core/plugins.md +344 -0
- package/core/react-adapter.md +262 -0
- package/core/resolvers.md +357 -0
- package/core/schema-types.md +262 -0
- package/core/system-api.md +271 -0
- package/core/testing.md +257 -0
- package/core/time-travel.md +238 -0
- package/dist/index.cjs +111 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +10 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +102 -0
- package/dist/index.js.map +1 -0
- package/examples/ab-testing.ts +385 -0
- package/examples/ai-checkpoint.ts +509 -0
- package/examples/ai-guardrails.ts +319 -0
- package/examples/ai-orchestrator.ts +589 -0
- package/examples/async-chains.ts +287 -0
- package/examples/auth-flow.ts +371 -0
- package/examples/batch-resolver.ts +341 -0
- package/examples/checkers.ts +589 -0
- package/examples/contact-form.ts +176 -0
- package/examples/counter.ts +393 -0
- package/examples/dashboard-loader.ts +512 -0
- package/examples/debounce-constraints.ts +105 -0
- package/examples/dynamic-modules.ts +293 -0
- package/examples/error-boundaries.ts +430 -0
- package/examples/feature-flags.ts +220 -0
- package/examples/form-wizard.ts +347 -0
- package/examples/fraud-analysis.ts +663 -0
- package/examples/goal-heist.ts +341 -0
- package/examples/multi-module.ts +57 -0
- package/examples/newsletter.ts +241 -0
- package/examples/notifications.ts +210 -0
- package/examples/optimistic-updates.ts +317 -0
- package/examples/pagination.ts +260 -0
- package/examples/permissions.ts +337 -0
- package/examples/provider-routing.ts +403 -0
- package/examples/server.ts +316 -0
- package/examples/shopping-cart.ts +422 -0
- package/examples/sudoku.ts +630 -0
- package/examples/theme-locale.ts +204 -0
- package/examples/time-machine.ts +225 -0
- package/examples/topic-guard.ts +306 -0
- package/examples/url-sync.ts +333 -0
- package/examples/websocket.ts +404 -0
- package/package.json +65 -0
package/ai/ai-tasks.md
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# AI Tasks
|
|
2
|
+
|
|
3
|
+
Tasks are deterministic, non-LLM work units that run alongside agents in orchestration patterns. Use tasks for data transformation, API calls, file I/O, or any work that does not need an LLM.
|
|
4
|
+
|
|
5
|
+
## Decision Tree: "Should this be an agent or a task?"
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
Does this work need an LLM?
|
|
9
|
+
├── Yes → Use an agent (AgentLike)
|
|
10
|
+
└── No → Use a task (TaskRegistration)
|
|
11
|
+
│
|
|
12
|
+
Does it need to call other agents?
|
|
13
|
+
├── Yes → Make it an agent, not a task (#33)
|
|
14
|
+
└── No → Task is correct
|
|
15
|
+
│
|
|
16
|
+
What does it receive?
|
|
17
|
+
└── Always a string → parse with JSON.parse() if structured (#34)
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## TaskRegistration Shape
|
|
21
|
+
|
|
22
|
+
```typescript
|
|
23
|
+
interface TaskRegistration {
|
|
24
|
+
// The work function — input is ALWAYS a string
|
|
25
|
+
run: (input: string, context: TaskContext) => Promise<string>;
|
|
26
|
+
|
|
27
|
+
// Human-readable label for debugging/logging
|
|
28
|
+
label?: string;
|
|
29
|
+
|
|
30
|
+
// Abort after this many ms
|
|
31
|
+
timeout?: number;
|
|
32
|
+
|
|
33
|
+
// Max parallel executions of this task
|
|
34
|
+
maxConcurrent?: number;
|
|
35
|
+
|
|
36
|
+
// Retry on failure
|
|
37
|
+
retry?: {
|
|
38
|
+
attempts: number;
|
|
39
|
+
backoff: "exponential" | "linear" | "none";
|
|
40
|
+
};
|
|
41
|
+
}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## TaskContext Shape
|
|
45
|
+
|
|
46
|
+
```typescript
|
|
47
|
+
interface TaskContext {
|
|
48
|
+
// Shared memory across runs
|
|
49
|
+
memory: AgentMemory;
|
|
50
|
+
|
|
51
|
+
// Ephemeral key-value store for the current pattern execution
|
|
52
|
+
scratchpad: Record<string, unknown>;
|
|
53
|
+
|
|
54
|
+
// Read another agent's current state (read-only)
|
|
55
|
+
readAgentState: (agentName: string) => AgentState;
|
|
56
|
+
|
|
57
|
+
// Report progress (0-100)
|
|
58
|
+
reportProgress: (percent: number, message?: string) => void;
|
|
59
|
+
|
|
60
|
+
// Cancellation signal
|
|
61
|
+
signal: AbortSignal;
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Registering Tasks
|
|
66
|
+
|
|
67
|
+
```typescript
|
|
68
|
+
import { createMultiAgentOrchestrator } from "@directive-run/ai";
|
|
69
|
+
|
|
70
|
+
const orchestrator = createMultiAgentOrchestrator({
|
|
71
|
+
agents: {
|
|
72
|
+
researcher: { name: "researcher", instructions: "...", model: "claude-sonnet-4-5" },
|
|
73
|
+
writer: { name: "writer", instructions: "...", model: "claude-sonnet-4-5" },
|
|
74
|
+
},
|
|
75
|
+
tasks: {
|
|
76
|
+
formatter: {
|
|
77
|
+
label: "Format research output",
|
|
78
|
+
timeout: 5000,
|
|
79
|
+
maxConcurrent: 3,
|
|
80
|
+
retry: { attempts: 2, backoff: "linear" },
|
|
81
|
+
run: async (input, context) => {
|
|
82
|
+
context.reportProgress(0, "Parsing input");
|
|
83
|
+
const data = JSON.parse(input);
|
|
84
|
+
|
|
85
|
+
context.reportProgress(50, "Formatting");
|
|
86
|
+
const formatted = formatData(data);
|
|
87
|
+
|
|
88
|
+
context.reportProgress(100, "Done");
|
|
89
|
+
|
|
90
|
+
return JSON.stringify(formatted);
|
|
91
|
+
},
|
|
92
|
+
},
|
|
93
|
+
validate: {
|
|
94
|
+
label: "Validate output schema",
|
|
95
|
+
run: async (input, context) => {
|
|
96
|
+
const parsed = JSON.parse(input);
|
|
97
|
+
if (!parsed.title || !parsed.body) {
|
|
98
|
+
throw new Error("Missing required fields: title, body");
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
return input;
|
|
102
|
+
},
|
|
103
|
+
},
|
|
104
|
+
},
|
|
105
|
+
runner,
|
|
106
|
+
});
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Tasks in Patterns
|
|
110
|
+
|
|
111
|
+
Tasks work in all composition patterns. Reference them by their key, just like agents:
|
|
112
|
+
|
|
113
|
+
### Sequential
|
|
114
|
+
|
|
115
|
+
```typescript
|
|
116
|
+
patterns: {
|
|
117
|
+
pipeline: sequential(["researcher", "formatter", "writer"]),
|
|
118
|
+
// researcher (agent) → formatter (task) → writer (agent)
|
|
119
|
+
},
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### DAG
|
|
123
|
+
|
|
124
|
+
```typescript
|
|
125
|
+
patterns: {
|
|
126
|
+
workflow: dag([
|
|
127
|
+
{ id: "research", handler: "researcher" },
|
|
128
|
+
{ id: "format", handler: "formatter", dependencies: ["research"] },
|
|
129
|
+
{ id: "validate", handler: "validate", dependencies: ["format"] },
|
|
130
|
+
{ id: "write", handler: "writer", dependencies: ["validate"] },
|
|
131
|
+
]),
|
|
132
|
+
},
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Parallel
|
|
136
|
+
|
|
137
|
+
```typescript
|
|
138
|
+
patterns: {
|
|
139
|
+
gather: parallel(["researcher", "formatter"], mergeResults),
|
|
140
|
+
},
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Reading Agent State from a Task
|
|
144
|
+
|
|
145
|
+
```typescript
|
|
146
|
+
tasks: {
|
|
147
|
+
summarize: {
|
|
148
|
+
run: async (input, context) => {
|
|
149
|
+
const researcherState = context.readAgentState("researcher");
|
|
150
|
+
const lastOutput = researcherState.lastOutput;
|
|
151
|
+
|
|
152
|
+
return `Summary of: ${lastOutput}`;
|
|
153
|
+
},
|
|
154
|
+
},
|
|
155
|
+
},
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Using the Scratchpad
|
|
159
|
+
|
|
160
|
+
The scratchpad persists across tasks within a single pattern execution:
|
|
161
|
+
|
|
162
|
+
```typescript
|
|
163
|
+
tasks: {
|
|
164
|
+
step1: {
|
|
165
|
+
run: async (input, context) => {
|
|
166
|
+
const parsed = JSON.parse(input);
|
|
167
|
+
context.scratchpad.itemCount = parsed.items.length;
|
|
168
|
+
|
|
169
|
+
return input;
|
|
170
|
+
},
|
|
171
|
+
},
|
|
172
|
+
step2: {
|
|
173
|
+
run: async (input, context) => {
|
|
174
|
+
const count = context.scratchpad.itemCount as number;
|
|
175
|
+
|
|
176
|
+
return `Processed ${count} items: ${input}`;
|
|
177
|
+
},
|
|
178
|
+
},
|
|
179
|
+
},
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## Anti-Patterns
|
|
183
|
+
|
|
184
|
+
### #33: Tasks calling agents internally
|
|
185
|
+
|
|
186
|
+
```typescript
|
|
187
|
+
// WRONG — tasks cannot invoke agents
|
|
188
|
+
tasks: {
|
|
189
|
+
enhance: {
|
|
190
|
+
run: async (input, context) => {
|
|
191
|
+
// Tasks have no runner access — this won't work
|
|
192
|
+
const result = await runner.run(someAgent, input);
|
|
193
|
+
|
|
194
|
+
return result.output;
|
|
195
|
+
},
|
|
196
|
+
},
|
|
197
|
+
},
|
|
198
|
+
|
|
199
|
+
// CORRECT — use a pattern to compose agents and tasks
|
|
200
|
+
patterns: {
|
|
201
|
+
enhance: sequential(["enhancer-agent", "format-task"]),
|
|
202
|
+
},
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### #34: Expecting structured input (not a string)
|
|
206
|
+
|
|
207
|
+
```typescript
|
|
208
|
+
// WRONG — task input is always a string
|
|
209
|
+
tasks: {
|
|
210
|
+
process: {
|
|
211
|
+
run: async (input, context) => {
|
|
212
|
+
// input.items is undefined — input is a string
|
|
213
|
+
return input.items.map((i) => i.name).join(", ");
|
|
214
|
+
},
|
|
215
|
+
},
|
|
216
|
+
},
|
|
217
|
+
|
|
218
|
+
// CORRECT — parse the string input
|
|
219
|
+
tasks: {
|
|
220
|
+
process: {
|
|
221
|
+
run: async (input, context) => {
|
|
222
|
+
const data = JSON.parse(input);
|
|
223
|
+
|
|
224
|
+
return data.items.map((i: { name: string }) => i.name).join(", ");
|
|
225
|
+
},
|
|
226
|
+
},
|
|
227
|
+
},
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### #35: Task and agent IDs collide
|
|
231
|
+
|
|
232
|
+
```typescript
|
|
233
|
+
// WRONG — "researcher" exists as both agent and task
|
|
234
|
+
agents: {
|
|
235
|
+
researcher: { name: "researcher", instructions: "...", model: "claude-sonnet-4-5" },
|
|
236
|
+
},
|
|
237
|
+
tasks: {
|
|
238
|
+
researcher: { run: async (input) => input }, // Name collision!
|
|
239
|
+
},
|
|
240
|
+
|
|
241
|
+
// CORRECT — use distinct names
|
|
242
|
+
agents: {
|
|
243
|
+
researcher: { name: "researcher", instructions: "...", model: "claude-sonnet-4-5" },
|
|
244
|
+
},
|
|
245
|
+
tasks: {
|
|
246
|
+
formatResearch: { run: async (input) => input },
|
|
247
|
+
},
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Quick Reference
|
|
251
|
+
|
|
252
|
+
| Feature | Agent | Task |
|
|
253
|
+
|---|---|---|
|
|
254
|
+
| Uses LLM | Yes | No |
|
|
255
|
+
| Input type | string | string |
|
|
256
|
+
| Can call other agents | Via patterns | No |
|
|
257
|
+
| Retry support | Via orchestrator | Built-in retry config |
|
|
258
|
+
| Progress reporting | No | `context.reportProgress()` |
|
|
259
|
+
| Concurrency control | Via patterns | `maxConcurrent` |
|
|
260
|
+
| Scratchpad access | No | Yes |
|
|
261
|
+
| Works in all patterns | Yes | Yes |
|
|
package/ai/ai-testing-evals.md
ADDED
@@ -0,0 +1,378 @@
|
|
|
1
|
+
# AI Testing and Evaluations
|
|
2
|
+
|
|
3
|
+
Mock runners, test orchestrators, assertion helpers, simulators, and an evaluation framework with LLM-as-judge and dataset-driven quality gates.
|
|
4
|
+
|
|
5
|
+
## Decision Tree: "How do I test my AI code?"
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
What are you testing?
|
|
9
|
+
├── Single agent behavior → createTestOrchestrator + createMockRunner
|
|
10
|
+
├── Multi-agent patterns → createTestMultiAgentOrchestrator
|
|
11
|
+
├── Specific agent was called → assertAgentCalled(mockRunner, name)
|
|
12
|
+
├── Token budget behavior → createMockRunner with token counts
|
|
13
|
+
├── Guardrail logic → createTestOrchestrator with guardrails
|
|
14
|
+
│
|
|
15
|
+
What are you evaluating?
|
|
16
|
+
├── Output quality → createEvaluator with criteria
|
|
17
|
+
├── Automated grading → LLM-as-judge evaluator
|
|
18
|
+
├── Regression testing → dataset-driven evaluation
|
|
19
|
+
├── CI quality gates → evaluation thresholds
|
|
20
|
+
│
|
|
21
|
+
Where do I import from?
|
|
22
|
+
├── Test utilities → '@directive-run/ai/testing'
|
|
23
|
+
├── Evaluators → '@directive-run/ai/testing'
|
|
24
|
+
└── Schema builders → '@directive-run/ai' (main, for t.*())
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Mock Runners
|
|
28
|
+
|
|
29
|
+
Create deterministic runners that return predefined responses:
|
|
30
|
+
|
|
31
|
+
```typescript
|
|
32
|
+
import { createMockRunner } from "@directive-run/ai/testing";
|
|
33
|
+
|
|
34
|
+
// Pattern-matched responses
|
|
35
|
+
const mockRunner = createMockRunner([
|
|
36
|
+
{ input: /analyze/, output: "Analysis complete: positive trend", tokens: 100 },
|
|
37
|
+
{ input: /summarize/, output: "Summary: key findings are...", tokens: 50 },
|
|
38
|
+
{ input: /translate/, output: "Translation: Hola mundo", tokens: 30 },
|
|
39
|
+
]);
|
|
40
|
+
|
|
41
|
+
// Catch-all default response
|
|
42
|
+
const mockRunnerWithDefault = createMockRunner([
|
|
43
|
+
{ input: /specific/, output: "Matched specific" },
|
|
44
|
+
{ input: /.*/, output: "Default response", tokens: 10 },
|
|
45
|
+
]);
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Mock Runner with Side Effects
|
|
49
|
+
|
|
50
|
+
```typescript
|
|
51
|
+
const mockRunner = createMockRunner([
|
|
52
|
+
{
|
|
53
|
+
input: /analyze/,
|
|
54
|
+
output: "Analysis complete",
|
|
55
|
+
tokens: 100,
|
|
56
|
+
// Simulate tool calls
|
|
57
|
+
toolCalls: [
|
|
58
|
+
{ name: "search", arguments: '{"query":"data"}', result: "found 5 items" },
|
|
59
|
+
],
|
|
60
|
+
// Simulate latency
|
|
61
|
+
delayMs: 200,
|
|
62
|
+
},
|
|
63
|
+
]);
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Test Orchestrator
|
|
67
|
+
|
|
68
|
+
Lightweight orchestrator for unit testing:
|
|
69
|
+
|
|
70
|
+
```typescript
|
|
71
|
+
import { createTestOrchestrator, createMockRunner } from "@directive-run/ai/testing";
import { t } from "@directive-run/ai";
|
|
72
|
+
|
|
73
|
+
const mockRunner = createMockRunner([
|
|
74
|
+
{ input: /analyze/, output: "Analysis: positive", tokens: 100 },
|
|
75
|
+
]);
|
|
76
|
+
|
|
77
|
+
const orchestrator = createTestOrchestrator({
|
|
78
|
+
runner: mockRunner,
|
|
79
|
+
factsSchema: {
|
|
80
|
+
result: t.string(),
|
|
81
|
+
confidence: t.number(),
|
|
82
|
+
},
|
|
83
|
+
init: (facts) => {
|
|
84
|
+
facts.result = "";
|
|
85
|
+
facts.confidence = 0;
|
|
86
|
+
},
|
|
87
|
+
constraints: {
|
|
88
|
+
lowConfidence: {
|
|
89
|
+
when: (facts) => facts.confidence < 0.5,
|
|
90
|
+
require: { type: "RE_ANALYZE" },
|
|
91
|
+
},
|
|
92
|
+
},
|
|
93
|
+
resolvers: {
|
|
94
|
+
reAnalyze: {
|
|
95
|
+
requirement: "RE_ANALYZE",
|
|
96
|
+
resolve: async (req, context) => {
|
|
97
|
+
context.facts.confidence = 0.8;
|
|
98
|
+
},
|
|
99
|
+
},
|
|
100
|
+
},
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
const agent = {
|
|
104
|
+
name: "analyst",
|
|
105
|
+
instructions: "You are a data analyst.",
|
|
106
|
+
model: "claude-sonnet-4-5",
|
|
107
|
+
};
|
|
108
|
+
|
|
109
|
+
const result = await orchestrator.run(agent, "Analyze this dataset");
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Assertion Helpers
|
|
113
|
+
|
|
114
|
+
Verify agent behavior after a test run:
|
|
115
|
+
|
|
116
|
+
```typescript
|
|
117
|
+
import {
|
|
118
|
+
assertAgentCalled,
|
|
119
|
+
assertAgentNotCalled,
|
|
120
|
+
assertTokensUsed,
|
|
121
|
+
assertGuardrailPassed,
|
|
122
|
+
assertGuardrailBlocked,
|
|
123
|
+
} from "@directive-run/ai/testing";
|
|
124
|
+
|
|
125
|
+
// Assert an agent was called with a matching input
|
|
126
|
+
assertAgentCalled(mockRunner, "analyst");
|
|
127
|
+
assertAgentCalled(mockRunner, "analyst", /analyze/);
|
|
128
|
+
|
|
129
|
+
// Assert an agent was NOT called
|
|
130
|
+
assertAgentNotCalled(mockRunner, "editor");
|
|
131
|
+
|
|
132
|
+
// Assert token usage within bounds
|
|
133
|
+
assertTokensUsed(result, { min: 50, max: 200 });
|
|
134
|
+
|
|
135
|
+
// Assert guardrail behavior
|
|
136
|
+
assertGuardrailPassed(result, "pii-detection");
|
|
137
|
+
assertGuardrailBlocked(result, "content-filter");
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Test Multi-Agent Orchestrator
|
|
141
|
+
|
|
142
|
+
```typescript
|
|
143
|
+
import {
|
|
144
|
+
createTestMultiAgentOrchestrator,
|
|
145
|
+
createMockRunner,
|
|
146
|
+
assertMultiAgentState,
assertAgentCalled,
|
|
147
|
+
} from "@directive-run/ai/testing";
|
|
148
|
+
|
|
149
|
+
const mockRunner = createMockRunner([
|
|
150
|
+
{ input: /research/, output: "Research findings: ...", tokens: 150 },
|
|
151
|
+
{ input: /write/, output: "Draft article: ...", tokens: 200 },
|
|
152
|
+
]);
|
|
153
|
+
|
|
154
|
+
const orchestrator = createTestMultiAgentOrchestrator({
|
|
155
|
+
agents: {
|
|
156
|
+
researcher: { name: "researcher", instructions: "Research.", model: "claude-sonnet-4-5" },
|
|
157
|
+
writer: { name: "writer", instructions: "Write.", model: "claude-sonnet-4-5" },
|
|
158
|
+
},
|
|
159
|
+
patterns: {
|
|
160
|
+
pipeline: sequential(["researcher", "writer"]),
|
|
161
|
+
},
|
|
162
|
+
runner: mockRunner,
|
|
163
|
+
});
|
|
164
|
+
|
|
165
|
+
orchestrator.start();
|
|
166
|
+
const result = await orchestrator.runPattern("pipeline", "Write about AI");
|
|
167
|
+
|
|
168
|
+
// Assert multi-agent state
|
|
169
|
+
assertMultiAgentState(orchestrator, {
|
|
170
|
+
completedAgents: ["researcher", "writer"],
|
|
171
|
+
activePattern: null,
|
|
172
|
+
});
|
|
173
|
+
|
|
174
|
+
assertAgentCalled(mockRunner, "researcher");
|
|
175
|
+
assertAgentCalled(mockRunner, "writer");
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## Simulators
|
|
179
|
+
|
|
180
|
+
Simulate specific conditions for testing edge cases:
|
|
181
|
+
|
|
182
|
+
```typescript
|
|
183
|
+
import { createErrorSimulator, createLatencySimulator } from "@directive-run/ai/testing";
|
|
184
|
+
|
|
185
|
+
// Simulate errors on specific calls
|
|
186
|
+
const errorRunner = createErrorSimulator(baseRunner, {
|
|
187
|
+
failOnCall: [2, 5], // Fail on 2nd and 5th calls
|
|
188
|
+
error: new Error("Rate limit exceeded"),
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
// Simulate variable latency
|
|
192
|
+
const slowRunner = createLatencySimulator(baseRunner, {
|
|
193
|
+
minDelay: 100,
|
|
194
|
+
maxDelay: 2000,
|
|
195
|
+
distribution: "normal", // "uniform" | "normal"
|
|
196
|
+
});
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Evaluation Framework
|
|
202
|
+
|
|
203
|
+
Measure and gate AI output quality with structured evaluations.
|
|
204
|
+
|
|
205
|
+
### Built-In Criteria
|
|
206
|
+
|
|
207
|
+
10+ evaluation criteria available out of the box:
|
|
208
|
+
|
|
209
|
+
```typescript
|
|
210
|
+
import { createEvaluator, criteria } from "@directive-run/ai/testing";
|
|
211
|
+
|
|
212
|
+
const evaluator = createEvaluator({
|
|
213
|
+
criteria: [
|
|
214
|
+
criteria.relevance(), // Is the output relevant to the input?
|
|
215
|
+
criteria.coherence(), // Is the output logically coherent?
|
|
216
|
+
criteria.completeness(), // Does it fully address the prompt?
|
|
217
|
+
criteria.accuracy(), // Is the information correct?
|
|
218
|
+
criteria.conciseness(), // Is it free of unnecessary content?
|
|
219
|
+
criteria.helpfulness(), // Is it useful to the user?
|
|
220
|
+
criteria.harmlessness(), // Is it free of harmful content?
|
|
221
|
+
criteria.factuality(), // Are claims factually supported?
|
|
222
|
+
criteria.creativity(), // Does it show original thinking?
|
|
223
|
+
criteria.instructionFollow(),// Does it follow the prompt instructions?
|
|
224
|
+
],
|
|
225
|
+
});
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### Custom Criteria
|
|
229
|
+
|
|
230
|
+
```typescript
|
|
231
|
+
const evaluator = createEvaluator({
|
|
232
|
+
criteria: [
|
|
233
|
+
{
|
|
234
|
+
name: "code-quality",
|
|
235
|
+
description: "Does the output contain valid, well-structured code?",
|
|
236
|
+
scorer: (input, output) => {
|
|
237
|
+
const hasCode = output.includes("function") || output.includes("const ");
|
|
238
|
+
const hasExplanation = output.length > 100;
|
|
239
|
+
|
|
240
|
+
if (hasCode && hasExplanation) {
|
|
241
|
+
return { score: 1.0, reason: "Contains code with explanation" };
|
|
242
|
+
}
|
|
243
|
+
if (hasCode) {
|
|
244
|
+
return { score: 0.7, reason: "Code present but no explanation" };
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
return { score: 0.2, reason: "No code block found" };
|
|
248
|
+
},
|
|
249
|
+
},
|
|
250
|
+
],
|
|
251
|
+
});
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
### Anti-Pattern #32: Side effects in evaluator scorer
|
|
255
|
+
|
|
256
|
+
```typescript
|
|
257
|
+
// WRONG — scorers must be pure functions
|
|
258
|
+
{
|
|
259
|
+
name: "quality",
|
|
260
|
+
scorer: (input, output) => {
|
|
261
|
+
// Side effects: writing files, calling APIs, mutating state
|
|
262
|
+
fs.writeFileSync("eval.log", output);
|
|
263
|
+
metrics.increment("evals");
|
|
264
|
+
|
|
265
|
+
return { score: 0.8, reason: "OK" };
|
|
266
|
+
},
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
// CORRECT — scorers are pure, return score + reason only
|
|
270
|
+
{
|
|
271
|
+
name: "quality",
|
|
272
|
+
scorer: (input, output) => {
|
|
273
|
+
const wordCount = output.split(/\s+/).length;
|
|
274
|
+
const isDetailed = wordCount > 50;
|
|
275
|
+
|
|
276
|
+
return {
|
|
277
|
+
score: isDetailed ? 1.0 : 0.5,
|
|
278
|
+
reason: isDetailed ? "Detailed response" : "Too brief",
|
|
279
|
+
};
|
|
280
|
+
},
|
|
281
|
+
}
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### LLM-as-Judge
|
|
285
|
+
|
|
286
|
+
Use an LLM to evaluate output quality:
|
|
287
|
+
|
|
288
|
+
```typescript
|
|
289
|
+
import { createLLMJudge } from "@directive-run/ai/testing";
|
|
290
|
+
|
|
291
|
+
const judge = createLLMJudge({
|
|
292
|
+
runner,
|
|
293
|
+
model: "claude-sonnet-4-5",
|
|
294
|
+
criteria: ["relevance", "accuracy", "completeness"],
|
|
295
|
+
rubric: `
|
|
296
|
+
Score 1.0: Fully addresses the prompt with accurate, complete information.
|
|
297
|
+
Score 0.7: Mostly accurate but missing some details.
|
|
298
|
+
Score 0.3: Partially relevant, significant gaps.
|
|
299
|
+
Score 0.0: Irrelevant or incorrect.
|
|
300
|
+
`,
|
|
301
|
+
});
|
|
302
|
+
|
|
303
|
+
const evalResult = await judge.evaluate({
|
|
304
|
+
input: "Explain quantum computing",
|
|
305
|
+
output: agentOutput,
|
|
306
|
+
reference: "Optional reference answer for comparison",
|
|
307
|
+
});
|
|
308
|
+
|
|
309
|
+
console.log(evalResult.score); // 0.85
|
|
310
|
+
console.log(evalResult.reason); // "Accurate explanation with good examples..."
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
### Dataset-Driven Evaluation
|
|
314
|
+
|
|
315
|
+
Run evaluations against a dataset for regression testing:
|
|
316
|
+
|
|
317
|
+
```typescript
|
|
318
|
+
import { createEvaluationSuite } from "@directive-run/ai/testing";
|
|
319
|
+
|
|
320
|
+
const suite = createEvaluationSuite({
|
|
321
|
+
evaluator,
|
|
322
|
+
dataset: [
|
|
323
|
+
{
|
|
324
|
+
input: "What is TypeScript?",
|
|
325
|
+
expectedOutput: "TypeScript is a typed superset of JavaScript...",
|
|
326
|
+
tags: ["basics"],
|
|
327
|
+
},
|
|
328
|
+
{
|
|
329
|
+
input: "Explain monads",
|
|
330
|
+
expectedOutput: "A monad is a design pattern...",
|
|
331
|
+
tags: ["advanced"],
|
|
332
|
+
},
|
|
333
|
+
],
|
|
334
|
+
});
|
|
335
|
+
|
|
336
|
+
const report = await suite.run(agent, runner);
|
|
337
|
+
|
|
338
|
+
console.log(report.averageScore); // 0.82
|
|
339
|
+
console.log(report.passRate); // 0.90 (90% of cases scored above the threshold)
|
|
340
|
+
console.log(report.failedCases); // Cases that scored below threshold
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
### CI Quality Gates
|
|
344
|
+
|
|
345
|
+
Fail CI pipelines when quality drops below a threshold:
|
|
346
|
+
|
|
347
|
+
```typescript
|
|
348
|
+
const report = await suite.run(agent, runner);
|
|
349
|
+
|
|
350
|
+
// Threshold-based gate
|
|
351
|
+
if (report.averageScore < 0.75) {
|
|
352
|
+
console.error(`Quality gate failed: ${report.averageScore} < 0.75`);
|
|
353
|
+
process.exit(1);
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
// Per-criteria gates
|
|
357
|
+
for (const criterion of report.criteria) {
|
|
358
|
+
if (criterion.averageScore < 0.6) {
|
|
359
|
+
console.error(`${criterion.name} failed: ${criterion.averageScore}`);
|
|
360
|
+
process.exit(1);
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
## Quick Reference
|
|
366
|
+
|
|
367
|
+
| API | Import Path | Purpose |
|
|
368
|
+
|---|---|---|
|
|
369
|
+
| `createMockRunner` | `@directive-run/ai/testing` | Deterministic test runner |
|
|
370
|
+
| `createTestOrchestrator` | `@directive-run/ai/testing` | Lightweight test orchestrator |
|
|
371
|
+
| `createTestMultiAgentOrchestrator` | `@directive-run/ai/testing` | Multi-agent test orchestrator |
|
|
372
|
+
| `assertAgentCalled` | `@directive-run/ai/testing` | Verify agent was invoked |
|
|
373
|
+
| `assertMultiAgentState` | `@directive-run/ai/testing` | Verify multi-agent state |
|
|
374
|
+
| `createEvaluator` | `@directive-run/ai/testing` | Rule-based evaluation |
|
|
375
|
+
| `createLLMJudge` | `@directive-run/ai/testing` | LLM-as-judge evaluation |
|
|
376
|
+
| `createEvaluationSuite` | `@directive-run/ai/testing` | Dataset-driven evaluation |
|
|
377
|
+
| `createErrorSimulator` | `@directive-run/ai/testing` | Simulate failures |
|
|
378
|
+
| `createLatencySimulator` | `@directive-run/ai/testing` | Simulate latency |
|