@theokit/sdk 1.7.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/bin/init-claude.mjs +34 -0
- package/claude-template/AGENTS.md +139 -0
- package/claude-template/CLAUDE.md +51 -0
- package/claude-template/dot-claude/rules/theokit-conventions.md +33 -0
- package/claude-template/dot-claude/settings.json +16 -0
- package/claude-template/dot-claude/skills/theokit-agent-core/SKILL.md +209 -0
- package/claude-template/dot-claude/skills/theokit-budget/SKILL.md +176 -0
- package/claude-template/dot-claude/skills/theokit-config/SKILL.md +139 -0
- package/claude-template/dot-claude/skills/theokit-cron/SKILL.md +148 -0
- package/claude-template/dot-claude/skills/theokit-di/SKILL.md +233 -0
- package/claude-template/dot-claude/skills/theokit-di-agent/SKILL.md +294 -0
- package/claude-template/dot-claude/skills/theokit-errors/SKILL.md +172 -0
- package/claude-template/dot-claude/skills/theokit-eval/SKILL.md +144 -0
- package/claude-template/dot-claude/skills/theokit-gateways/SKILL.md +209 -0
- package/claude-template/dot-claude/skills/theokit-memory/SKILL.md +176 -0
- package/claude-template/dot-claude/skills/theokit-rag/SKILL.md +226 -0
- package/claude-template/dot-claude/skills/theokit-streaming/SKILL.md +156 -0
- package/claude-template/dot-claude/skills/theokit-subscriptions/SKILL.md +148 -0
- package/claude-template/dot-claude/skills/theokit-tools/SKILL.md +170 -0
- package/claude-template/dot-claude/skills/theokit-workflows/SKILL.md +218 -0
- package/package.json +3 -1
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
---
|
|
2
|
+
user-invocable: false
|
|
3
|
+
description: All 15 agentic decorators from @theokit/di-agent for tools, workflows, evals, cron, and more.
|
|
4
|
+
paths:
|
|
5
|
+
- "**/*decorator*"
|
|
6
|
+
- "**/*Decorator*"
|
|
7
|
+
- "**/di-agent*"
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# TheoKit DI-Agent -- Agentic Decorators
|
|
11
|
+
|
|
12
|
+
Quick reference for `@theokit/di-agent` -- 15 decorators that wire agentic capabilities into DI-managed classes.
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pnpm add @theokit/di-agent @theokit/di @theokit/sdk
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Requires `reflect-metadata` and TypeScript decorator support (see `@theokit/di` docs).
|
|
21
|
+
|
|
22
|
+
## createAgentProvider
|
|
23
|
+
|
|
24
|
+
Bridges `@theokit/di` container with `@theokit/sdk` Agent. Reads decorator metadata from all registered classes and wires tools, workflows, evals, cron jobs, etc.
|
|
25
|
+
|
|
26
|
+
```typescript
|
|
27
|
+
import { Container } from "@theokit/di";
|
|
28
|
+
import { createAgentProvider } from "@theokit/di-agent";
|
|
29
|
+
|
|
30
|
+
const container = new Container();
|
|
31
|
+
container.register(MyToolService);
|
|
32
|
+
container.register(MyWorkflowService);
|
|
33
|
+
|
|
34
|
+
const { agent, dispose } = await createAgentProvider(container, {
|
|
35
|
+
apiKey: process.env.THEOKIT_API_KEY!,
|
|
36
|
+
model: { id: "google/gemini-2.0-flash-001" },
|
|
37
|
+
local: { cwd: process.cwd() },
|
|
38
|
+
});
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## @Tool
|
|
42
|
+
|
|
43
|
+
Registers a method as a custom tool exposed to the LLM.
|
|
44
|
+
|
|
45
|
+
```typescript
|
|
46
|
+
import { Injectable } from "@theokit/di";
|
|
47
|
+
import { Tool } from "@theokit/di-agent";
|
|
48
|
+
import { z } from "zod";
|
|
49
|
+
|
|
50
|
+
@Injectable()
|
|
51
|
+
class MathService {
|
|
52
|
+
@Tool({
|
|
53
|
+
name: "calculate",
|
|
54
|
+
description: "Evaluate a math expression.",
|
|
55
|
+
inputSchema: z.object({ expression: z.string() }),
|
|
56
|
+
})
|
|
57
|
+
calculate(input: { expression: string }): string {
|
|
58
|
+
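return String(eval(input.expression)); // DEMO ONLY: eval() on LLM-supplied input allows arbitrary code execution; use a real math-expression parser in production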
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## @Workflow
|
|
64
|
+
|
|
65
|
+
Marks a method as a workflow step definition.
|
|
66
|
+
|
|
67
|
+
```typescript
|
|
68
|
+
import { Workflow } from "@theokit/di-agent";
|
|
69
|
+
|
|
70
|
+
@Injectable()
|
|
71
|
+
class PipelineService {
|
|
72
|
+
@Workflow({ name: "data-pipeline", description: "ETL workflow." })
|
|
73
|
+
async run(input: { source: string }) {
|
|
74
|
+
// workflow implementation
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## @EvalDecorator
|
|
80
|
+
|
|
81
|
+
Registers an eval suite on a method.
|
|
82
|
+
|
|
83
|
+
```typescript
|
|
84
|
+
import { EvalDecorator } from "@theokit/di-agent";
|
|
85
|
+
|
|
86
|
+
@Injectable()
|
|
87
|
+
class QAService {
|
|
88
|
+
@EvalDecorator({
|
|
89
|
+
name: "qa-smoke",
|
|
90
|
+
dataset: [{ input: "Say ok.", expected: "ok" }],
|
|
91
|
+
})
|
|
92
|
+
async evaluate() { /* ... */ }
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## @Cron
|
|
97
|
+
|
|
98
|
+
Registers a cron-scheduled agent task.
|
|
99
|
+
|
|
100
|
+
```typescript
|
|
101
|
+
import { Cron } from "@theokit/di-agent";
|
|
102
|
+
|
|
103
|
+
@Injectable()
|
|
104
|
+
class ReportService {
|
|
105
|
+
@Cron({
|
|
106
|
+
expression: "0 9 * * *",
|
|
107
|
+
timezone: "America/Sao_Paulo",
|
|
108
|
+
message: "Summarize yesterday's commits.",
|
|
109
|
+
})
|
|
110
|
+
async dailyReport() { /* ... */ }
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## @Subscription
|
|
115
|
+
|
|
116
|
+
Marks a method as a real-time subscription handler.
|
|
117
|
+
|
|
118
|
+
```typescript
|
|
119
|
+
import { Subscription } from "@theokit/di-agent";
|
|
120
|
+
|
|
121
|
+
@Injectable()
|
|
122
|
+
class EventService {
|
|
123
|
+
@Subscription({ topic: "orders.created", description: "Handle new orders." })
|
|
124
|
+
async onOrder(event: unknown) { /* ... */ }
|
|
125
|
+
}
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## @Auth
|
|
129
|
+
|
|
130
|
+
Registers authentication/authorization logic for agent operations.
|
|
131
|
+
|
|
132
|
+
```typescript
|
|
133
|
+
import { Auth } from "@theokit/di-agent";
|
|
134
|
+
|
|
135
|
+
@Injectable()
|
|
136
|
+
class SecurityService {
|
|
137
|
+
@Auth({ strategy: "bearer", description: "JWT validation." })
|
|
138
|
+
async validate(token: string): Promise<boolean> { /* ... */ }
|
|
139
|
+
}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## @Retriever
|
|
143
|
+
|
|
144
|
+
Registers a retrieval method for RAG pipelines.
|
|
145
|
+
|
|
146
|
+
```typescript
|
|
147
|
+
import { Retriever } from "@theokit/di-agent";
|
|
148
|
+
|
|
149
|
+
@Injectable()
|
|
150
|
+
class SearchService {
|
|
151
|
+
@Retriever({ name: "docs-search", description: "Search documentation." })
|
|
152
|
+
async search(query: string) { /* ... */ }
|
|
153
|
+
}
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## @Reranker
|
|
157
|
+
|
|
158
|
+
Registers a reranking method for RAG pipelines.
|
|
159
|
+
|
|
160
|
+
```typescript
|
|
161
|
+
import { Reranker } from "@theokit/di-agent";
|
|
162
|
+
|
|
163
|
+
@Injectable()
|
|
164
|
+
class RankService {
|
|
165
|
+
@Reranker({ name: "cohere-reranker", model: "rerank-v3.5" })
|
|
166
|
+
async rerank(query: string, docs: unknown[]) { /* ... */ }
|
|
167
|
+
}
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## @TextSplitter
|
|
171
|
+
|
|
172
|
+
Registers a text splitting strategy.
|
|
173
|
+
|
|
174
|
+
```typescript
|
|
175
|
+
import { TextSplitter } from "@theokit/di-agent";
|
|
176
|
+
|
|
177
|
+
@Injectable()
|
|
178
|
+
class SplitterService {
|
|
179
|
+
@TextSplitter({ strategy: "recursive", chunkSize: 1000, overlap: 100 })
|
|
180
|
+
split(text: string) { /* ... */ }
|
|
181
|
+
}
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## @UseSandbox
|
|
185
|
+
|
|
186
|
+
Marks a class or method for sandboxed execution.
|
|
187
|
+
|
|
188
|
+
```typescript
|
|
189
|
+
import { UseSandbox } from "@theokit/di-agent";
|
|
190
|
+
|
|
191
|
+
@Injectable()
|
|
192
|
+
class CodeRunner {
|
|
193
|
+
@UseSandbox({ enabled: true })
|
|
194
|
+
async execute(code: string) { /* ... */ }
|
|
195
|
+
}
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## @SubAgent
|
|
199
|
+
|
|
200
|
+
Declares a subagent definition on a method.
|
|
201
|
+
|
|
202
|
+
```typescript
|
|
203
|
+
import { SubAgent } from "@theokit/di-agent";
|
|
204
|
+
|
|
205
|
+
@Injectable()
|
|
206
|
+
class AgentOrchestrator {
|
|
207
|
+
@SubAgent({
|
|
208
|
+
name: "code-reviewer",
|
|
209
|
+
description: "Expert code reviewer.",
|
|
210
|
+
prompt: "Review for bugs and security issues.",
|
|
211
|
+
})
|
|
212
|
+
async review() { /* ... */ }
|
|
213
|
+
}
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
## @Hitl (Human-in-the-Loop)
|
|
217
|
+
|
|
218
|
+
Marks a method as requiring human approval before proceeding.
|
|
219
|
+
|
|
220
|
+
```typescript
|
|
221
|
+
import { Hitl } from "@theokit/di-agent";
|
|
222
|
+
|
|
223
|
+
@Injectable()
|
|
224
|
+
class ApprovalService {
|
|
225
|
+
@Hitl({ description: "Requires manager approval.", timeout: 3600_000 })
|
|
226
|
+
async approve(request: unknown) { /* ... */ }
|
|
227
|
+
}
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## @AutoSummarize
|
|
231
|
+
|
|
232
|
+
Enables automatic conversation summarization.
|
|
233
|
+
|
|
234
|
+
```typescript
|
|
235
|
+
import { AutoSummarize } from "@theokit/di-agent";
|
|
236
|
+
|
|
237
|
+
@Injectable()
|
|
238
|
+
class ChatService {
|
|
239
|
+
@AutoSummarize({ maxTurns: 20, strategy: "rolling" })
|
|
240
|
+
async chat() { /* ... */ }
|
|
241
|
+
}
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## @InjectAgent
|
|
245
|
+
|
|
246
|
+
Injects the current `SDKAgent` instance into a class.
|
|
247
|
+
|
|
248
|
+
```typescript
|
|
249
|
+
import { Injectable } from "@theokit/di";
|
|
250
|
+
import { InjectAgent } from "@theokit/di-agent";
|
|
251
|
+
import type { SDKAgent } from "@theokit/sdk";
|
|
252
|
+
|
|
253
|
+
@Injectable()
|
|
254
|
+
class AgentAwareService {
|
|
255
|
+
constructor(@InjectAgent() private readonly agent: SDKAgent) {}
|
|
256
|
+
|
|
257
|
+
async doWork() {
|
|
258
|
+
const run = await this.agent.send("Do something");
|
|
259
|
+
await run.wait();
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
## @MemoryScopeDecorator
|
|
265
|
+
|
|
266
|
+
Configures memory scope for a class.
|
|
267
|
+
|
|
268
|
+
```typescript
|
|
269
|
+
import { MemoryScopeDecorator } from "@theokit/di-agent";
|
|
270
|
+
|
|
271
|
+
@Injectable()
|
|
272
|
+
@MemoryScopeDecorator({ namespace: "billing", scope: "user" })
|
|
273
|
+
class BillingService { /* ... */ }
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
## Reading metadata (for framework authors)
|
|
277
|
+
|
|
278
|
+
Each decorator has a companion reader function:
|
|
279
|
+
|
|
280
|
+
```typescript
|
|
281
|
+
import { readToolMetadata } from "@theokit/di-agent";
|
|
282
|
+
import { readWorkflowMetadata } from "@theokit/di-agent";
|
|
283
|
+
import { readCronMetadata } from "@theokit/di-agent";
|
|
284
|
+
// ... readEvalDecoratorMetadata, readRetrieverMetadata, etc.
|
|
285
|
+
|
|
286
|
+
const tools = readToolMetadata(MyToolService);
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
## AGENT_TOKEN
|
|
290
|
+
|
|
291
|
+
```typescript
|
|
292
|
+
import { AGENT_TOKEN } from "@theokit/di-agent";
|
|
293
|
+
// Symbol token for agent injection in the DI container
|
|
294
|
+
```
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
---
|
|
2
|
+
user-invocable: false
|
|
3
|
+
paths:
|
|
4
|
+
- "**/*error*"
|
|
5
|
+
- "**/*Error*"
|
|
6
|
+
- "**/*exception*"
|
|
7
|
+
description: TheoKit SDK error hierarchy — TheokitAgentError, error codes, retry patterns
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# TheoKit Error Handling
|
|
11
|
+
|
|
12
|
+
Most SDK errors extend `TheokitAgentError` (`UnsupportedRunOperationError` and `AgentRunError` sit outside it). Use `isRetryable` to drive
|
|
13
|
+
retry/backoff logic without coupling to specific subclasses.
|
|
14
|
+
|
|
15
|
+
## Error hierarchy
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
Error
|
|
19
|
+
+-- TheokitAgentError
|
|
20
|
+
| +-- AuthenticationError
|
|
21
|
+
| +-- RateLimitError
|
|
22
|
+
| +-- ConfigurationError
|
|
23
|
+
| | +-- IntegrationNotConnectedError
|
|
24
|
+
| +-- NetworkError
|
|
25
|
+
| +-- UnknownAgentError
|
|
26
|
+
|
|
|
27
|
+
+-- UnsupportedRunOperationError (separate hierarchy)
|
|
28
|
+
+-- AgentRunError (thrown by Agent.prompt with throwOnError)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Error reference
|
|
32
|
+
|
|
33
|
+
| Error | When | `isRetryable` |
|
|
34
|
+
|---|---|---|
|
|
35
|
+
| `AuthenticationError` | Invalid API key, not logged in, insufficient permissions | `false` |
|
|
36
|
+
| `RateLimitError` | Too many requests or usage limits exceeded | `true` |
|
|
37
|
+
| `ConfigurationError` | Invalid model, bad request parameters, malformed options | `false` |
|
|
38
|
+
| `IntegrationNotConnectedError` | Cloud agent for a repo whose SCM is not connected | `false` |
|
|
39
|
+
| `NetworkError` | Service unavailable, timeout, transport failure | `true` |
|
|
40
|
+
| `UnknownAgentError` | Catch-all for unclassified errors | `false` |
|
|
41
|
+
| `UnsupportedRunOperationError` | Runtime does not support a `Run` operation | n/a |
|
|
42
|
+
| `AgentRunError` | Run finished with error status (only with `throwOnError: true`) | n/a |
|
|
43
|
+
|
|
44
|
+
## `TheokitAgentError` properties
|
|
45
|
+
|
|
46
|
+
```typescript
|
|
47
|
+
class TheokitAgentError extends Error {
|
|
48
|
+
readonly isRetryable: boolean;
|
|
49
|
+
readonly code?: string;
|
|
50
|
+
readonly protoErrorCode?: string;
|
|
51
|
+
readonly cause?: unknown;
|
|
52
|
+
readonly metadata?: ErrorMetadata; // v1.3+ provider HTTP errors
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## `ErrorMetadata` (v1.3+)
|
|
57
|
+
|
|
58
|
+
When an error originates from a provider HTTP call:
|
|
59
|
+
|
|
60
|
+
```typescript
|
|
61
|
+
interface ErrorMetadata {
|
|
62
|
+
provider: string; // "anthropic" | "openai" | "openrouter" | ...
|
|
63
|
+
endpoint: string; // "/v1/messages" | "/v1/chat/completions"
|
|
64
|
+
code: ErrorCode;
|
|
65
|
+
statusCode?: number;
|
|
66
|
+
retryAfter?: number; // seconds
|
|
67
|
+
raw?: unknown; // raw response body (truncated ~2KB)
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
type ErrorCode =
|
|
71
|
+
| "rate_limit" | "auth_failed" | "invalid_request"
|
|
72
|
+
| "timeout" | "server_error" | "context_too_long"
|
|
73
|
+
| "content_filtered" | "model_unavailable"
|
|
74
|
+
| "network" | "unknown";
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Retry pattern
|
|
78
|
+
|
|
79
|
+
```typescript
|
|
80
|
+
import { TheokitAgentError, type Run } from "@theokit/sdk";
|
|
81
|
+
|
|
82
|
+
async function withRetry(send: () => Promise<Run>, attempts = 3): Promise<Run> {
|
|
83
|
+
let lastError: unknown;
|
|
84
|
+
for (let i = 0; i < attempts; i++) {
|
|
85
|
+
try {
|
|
86
|
+
return await send();
|
|
87
|
+
} catch (err) {
|
|
88
|
+
lastError = err;
|
|
89
|
+
if (err instanceof TheokitAgentError && err.isRetryable && i < attempts - 1) {
|
|
90
|
+
await new Promise((r) => setTimeout(r, 2 ** i * 1000));
|
|
91
|
+
continue;
|
|
92
|
+
}
|
|
93
|
+
throw err;
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
throw lastError;
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Using metadata for programmatic handling
|
|
101
|
+
|
|
102
|
+
```typescript
|
|
103
|
+
try {
|
|
104
|
+
await agent.send("...");
|
|
105
|
+
} catch (err) {
|
|
106
|
+
if (err instanceof TheokitAgentError && err.metadata) {
|
|
107
|
+
switch (err.metadata.code) {
|
|
108
|
+
case "rate_limit":
|
|
109
|
+
await wait(err.metadata.retryAfter ?? 60); // retryAfter is in seconds; wait() and retry() are your own helpers
|
|
110
|
+
return retry();
|
|
111
|
+
case "auth_failed":
|
|
112
|
+
throw new Error(`Check API key for ${err.metadata.provider}`);
|
|
113
|
+
case "context_too_long":
|
|
114
|
+
// trigger prompt compression
|
|
115
|
+
break;
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
throw err;
|
|
119
|
+
}
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## `IntegrationNotConnectedError`
|
|
123
|
+
|
|
124
|
+
```typescript
|
|
125
|
+
import { IntegrationNotConnectedError } from "@theokit/sdk/errors";
|
|
126
|
+
|
|
127
|
+
try {
|
|
128
|
+
await Agent.create({ /* cloud with disconnected repo */ });
|
|
129
|
+
} catch (err) {
|
|
130
|
+
if (err instanceof IntegrationNotConnectedError) {
|
|
131
|
+
console.error(`Connect ${err.provider} at ${err.helpUrl}`);
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## `UnsupportedRunOperationError`
|
|
137
|
+
|
|
138
|
+
Thrown when the runtime does not support a `Run` operation. Check support before calling runtime-dependent operations:
|
|
139
|
+
|
|
140
|
+
```typescript
|
|
141
|
+
if (run.supports("conversation")) {
|
|
142
|
+
const turns = await run.conversation();
|
|
143
|
+
} else {
|
|
144
|
+
console.log(run.unsupportedReason("conversation"));
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Tree-shaking
|
|
149
|
+
|
|
150
|
+
Import error classes from the `/errors` subpath to avoid pulling the full SDK:
|
|
151
|
+
|
|
152
|
+
```typescript
|
|
153
|
+
import { TheokitAgentError, RateLimitError } from "@theokit/sdk/errors";
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## `throwOnError` on `Agent.prompt`
|
|
157
|
+
|
|
158
|
+
```typescript
|
|
159
|
+
import { Agent, AgentRunError } from "@theokit/sdk";
|
|
160
|
+
|
|
161
|
+
try {
|
|
162
|
+
const result = await Agent.prompt("hi", {
|
|
163
|
+
apiKey: process.env.ANTHROPIC_API_KEY!,
|
|
164
|
+
model: { id: "claude-sonnet-4-5-20250929" },
|
|
165
|
+
throwOnError: true,
|
|
166
|
+
});
|
|
167
|
+
} catch (err) {
|
|
168
|
+
if (err instanceof AgentRunError && err.code === "auth_failed") {
|
|
169
|
+
// bad API key
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
```
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
---
|
|
2
|
+
user-invocable: false
|
|
3
|
+
paths:
|
|
4
|
+
- "**/*eval*"
|
|
5
|
+
- "**/*Eval*"
|
|
6
|
+
- "**/*scorer*"
|
|
7
|
+
description: TheoKit SDK Eval suite API reference — Eval.create, scorers, datasets, EvalRun
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# TheoKit Eval Suite
|
|
11
|
+
|
|
12
|
+
Eval-as-code primitive for production deploy gates. Run evals against real LLM
|
|
13
|
+
providers to measure quality, latency, and cost before shipping.
|
|
14
|
+
|
|
15
|
+
## Quick start
|
|
16
|
+
|
|
17
|
+
```typescript
|
|
18
|
+
import { Eval, Scorers } from "@theokit/sdk";
|
|
19
|
+
|
|
20
|
+
const run = await Eval.create({
|
|
21
|
+
name: "qa-smoke",
|
|
22
|
+
dataset: [
|
|
23
|
+
{ input: "Reply with the word: ok.", expected: "ok" },
|
|
24
|
+
{ input: "Say jazz in one word.", expected: "jazz" },
|
|
25
|
+
],
|
|
26
|
+
scorers: [
|
|
27
|
+
Scorers.containsExpected({ caseSensitive: false }),
|
|
28
|
+
Scorers.regex(/[a-zA-Z]/),
|
|
29
|
+
],
|
|
30
|
+
agent: {
|
|
31
|
+
apiKey: process.env.OPENROUTER_API_KEY,
|
|
32
|
+
model: { id: "openai/gpt-4o-mini" },
|
|
33
|
+
local: { cwd: process.cwd(), sandboxOptions: { enabled: false } },
|
|
34
|
+
},
|
|
35
|
+
concurrency: 4,
|
|
36
|
+
}).run();
|
|
37
|
+
|
|
38
|
+
console.log(run.aggregate.meanScore); // 0.95
|
|
39
|
+
console.log(run.aggregate.passRatio); // 1.0
|
|
40
|
+
console.log(run.aggregate.tokensInTotal); // 142
|
|
41
|
+
console.log(run.aggregate.durationMsP95); // 1830
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Built-in scorers (`Scorers`)
|
|
45
|
+
|
|
46
|
+
| Scorer | What it checks |
|
|
47
|
+
|---|---|
|
|
48
|
+
| `Scorers.exactMatch({ caseSensitive? })` | `output.trim() === expected.trim()` — refuses empty `expected` |
|
|
49
|
+
| `Scorers.containsExpected({ caseSensitive? })` | `output.includes(expected)` — refuses empty `expected` |
|
|
50
|
+
| `Scorers.regex(pattern)` | `pattern.test(output)` — vet your patterns against adversarial output beforehand to avoid ReDoS |
|
|
51
|
+
| `Scorers.jsonShape(zodSchema, { strict? })` | `JSON.parse(output)` + Zod validation — caps output at 1 MB before parse |
|
|
52
|
+
| `Scorers.llmJudge({ model, apiKey, criteria, rubric? })` | Second LLM scores against criteria — requires SEPARATE `apiKey` |
|
|
53
|
+
|
|
54
|
+
### Custom scorer
|
|
55
|
+
|
|
56
|
+
A scorer is an async function returning a number between 0 and 1:
|
|
57
|
+
|
|
58
|
+
```typescript
|
|
59
|
+
const myScorer = async (row: { input: string; output: string; expected?: string }) => {
|
|
60
|
+
return row.output.length < 100 ? 1.0 : 0.5;
|
|
61
|
+
};
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Dataset
|
|
65
|
+
|
|
66
|
+
The `dataset` field accepts an array of objects with `input` and optional `expected`:
|
|
67
|
+
|
|
68
|
+
```typescript
|
|
69
|
+
interface EvalDatasetRow {
|
|
70
|
+
input: string;
|
|
71
|
+
expected?: string;
|
|
72
|
+
}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Recommended ceiling: ~10k rows (v1 materializes in memory). For larger evals,
|
|
76
|
+
partition into multiple `Eval.create` calls.
|
|
77
|
+
|
|
78
|
+
## `EvalRun` shape
|
|
79
|
+
|
|
80
|
+
```typescript
|
|
81
|
+
interface EvalRun {
|
|
82
|
+
id: string;
|
|
83
|
+
name: string;
|
|
84
|
+
startedAt: number;
|
|
85
|
+
endedAt: number;
|
|
86
|
+
durationMs: number;
|
|
87
|
+
aggregate: EvalAggregate;
|
|
88
|
+
rows: ReadonlyArray<EvalRowResult>;
|
|
89
|
+
metadata?: Record<string, unknown>;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
interface EvalAggregate {
|
|
93
|
+
meanScore: number;
|
|
94
|
+
medianScore: number;
|
|
95
|
+
passRatio: number; // fraction of rows whose mean score is >= 0.5
|
|
96
|
+
perScorer: Record<string, { mean: number; median: number; min: number; max: number }>;
|
|
97
|
+
totalRows: number;
|
|
98
|
+
errorRows: number;
|
|
99
|
+
durationMsP50: number;
|
|
100
|
+
durationMsP95: number;
|
|
101
|
+
tokensInTotal: number;
|
|
102
|
+
tokensOutTotal: number;
|
|
103
|
+
}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
`EvalRun` is plain JSON — `JSON.stringify(run)` works directly.
|
|
107
|
+
|
|
108
|
+
## Concurrency
|
|
109
|
+
|
|
110
|
+
`concurrency` defaults to 4. Allowed range: `[1, 64]` (integer). 0 and
|
|
111
|
+
Infinity are rejected at `Eval.create` time.
|
|
112
|
+
|
|
113
|
+
## Concurrent runs
|
|
114
|
+
|
|
115
|
+
Per-process single-flight per `name`. Two `Eval.run` calls with the same
|
|
116
|
+
`name` running simultaneously throw `EvalAlreadyRunningError`. Include model
|
|
117
|
+
id in the name for matrix runs.
|
|
118
|
+
|
|
119
|
+
## CLI integration
|
|
120
|
+
|
|
121
|
+
The `theokit eval` CLI invokes `Eval.run` internally. User-authored
|
|
122
|
+
`eval.config.{ts,mjs}` files are forward-compatible.
|
|
123
|
+
|
|
124
|
+
## Telemetry
|
|
125
|
+
|
|
126
|
+
When `agent.telemetry.enabled === true`, `Eval.run` emits a parent `eval.run`
|
|
127
|
+
OTel span; `agent.send` / `llm.call` spans nest under it.
|
|
128
|
+
|
|
129
|
+
## Cost forecasting
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
aggregate.tokensInTotal x provider_input_price
|
|
133
|
+
+ aggregate.tokensOutTotal x provider_output_price
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
With `llmJudge`, add ~1 judge call per row. 1000 rows with `gpt-4o-mini`
|
|
137
|
+
costs roughly $3.00 total (base + judge).
|
|
138
|
+
|
|
139
|
+
## Errors
|
|
140
|
+
|
|
141
|
+
| Error | When |
|
|
142
|
+
|---|---|
|
|
143
|
+
| `EvalAlreadyRunningError` | Same `name` already running in this process |
|
|
144
|
+
| `ConfigurationError` | Invalid concurrency, missing required fields |
|