@tuttiai/core 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +72 -2
- package/dist/index.js +335 -126
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { TuttiHooks, TelemetryConfig, TuttiEventType, TuttiEvent,
|
|
1
|
+
import { TuttiHooks, TokenUsage, ScoreConfig, TelemetryConfig, TuttiEventType, TuttiEvent, AgentResult, Session, LLMProvider, SessionStore, AgentConfig, ChatMessage, Voice, Permission, BudgetConfig, ChatRequest, ChatResponse, StreamChunk } from '@tuttiai/types';
|
|
2
2
|
export { AgentConfig, AgentMemoryConfig, AgentResult, BudgetConfig, ChatMessage, ChatRequest, ChatResponse, ContentBlock, HookContext, LLMProvider, MemoryConfig, Permission, ScoreConfig, Session, SessionStore, StopReason, StreamChunk, TelemetryConfig, TextBlock, TokenUsage, Tool, ToolContext, ToolDefinition, ToolMemoryHelpers, ToolResult, ToolResultBlock, ToolUseBlock, TuttiEvent, TuttiEventHandler, TuttiEventType, TuttiHooks, Voice, VoiceContext } from '@tuttiai/types';
|
|
3
3
|
import pino from 'pino';
|
|
4
4
|
|
|
@@ -84,6 +84,76 @@ declare function createBlocklistHook(blockedTools: string[]): TuttiHooks;
|
|
|
84
84
|
*/
|
|
85
85
|
declare function createMaxCostHook(maxUsd: number): TuttiHooks;
|
|
86
86
|
|
|
87
|
+
/** Evaluation framework types. */
|
|
88
|
+
|
|
89
|
+
interface EvalAssertion {
|
|
90
|
+
type: "contains" | "not_contains" | "matches_regex" | "tool_called" | "tool_not_called" | "turns_lte" | "cost_lte";
|
|
91
|
+
value: string | number;
|
|
92
|
+
description?: string;
|
|
93
|
+
}
|
|
94
|
+
interface EvalCase {
|
|
95
|
+
id: string;
|
|
96
|
+
name: string;
|
|
97
|
+
agent_id: string;
|
|
98
|
+
input: string;
|
|
99
|
+
assertions: EvalAssertion[];
|
|
100
|
+
}
|
|
101
|
+
interface EvalSuite {
|
|
102
|
+
name: string;
|
|
103
|
+
cases: EvalCase[];
|
|
104
|
+
}
|
|
105
|
+
interface AssertionResult {
|
|
106
|
+
assertion: EvalAssertion;
|
|
107
|
+
passed: boolean;
|
|
108
|
+
actual: string | number;
|
|
109
|
+
}
|
|
110
|
+
interface EvalResult {
|
|
111
|
+
case_id: string;
|
|
112
|
+
case_name: string;
|
|
113
|
+
passed: boolean;
|
|
114
|
+
score: number;
|
|
115
|
+
output: string;
|
|
116
|
+
turns: number;
|
|
117
|
+
usage: TokenUsage;
|
|
118
|
+
cost_usd: number;
|
|
119
|
+
duration_ms: number;
|
|
120
|
+
assertions: AssertionResult[];
|
|
121
|
+
error?: string;
|
|
122
|
+
}
|
|
123
|
+
interface EvalSummary {
|
|
124
|
+
total: number;
|
|
125
|
+
passed: number;
|
|
126
|
+
failed: number;
|
|
127
|
+
avg_score: number;
|
|
128
|
+
total_cost_usd: number;
|
|
129
|
+
total_duration_ms: number;
|
|
130
|
+
}
|
|
131
|
+
interface EvalReport {
|
|
132
|
+
suite_name: string;
|
|
133
|
+
results: EvalResult[];
|
|
134
|
+
summary: EvalSummary;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/** Evaluation runner — executes test suites against a score. */
|
|
138
|
+
|
|
139
|
+
declare class EvalRunner {
|
|
140
|
+
private runtime;
|
|
141
|
+
constructor(score: ScoreConfig);
|
|
142
|
+
run(suite: EvalSuite): Promise<EvalReport>;
|
|
143
|
+
private runCase;
|
|
144
|
+
private checkAssertion;
|
|
145
|
+
private summarize;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/** Evaluation report formatting — table, JSON, Markdown. */
|
|
149
|
+
|
|
150
|
+
/** Print a formatted results table to stdout. */
|
|
151
|
+
declare function printTable(report: EvalReport): void;
|
|
152
|
+
/** Convert report to a plain JSON object for storage or CI. */
|
|
153
|
+
declare function toJSON(report: EvalReport): string;
|
|
154
|
+
/** Convert report to a GitHub-friendly markdown table. */
|
|
155
|
+
declare function toMarkdown(report: EvalReport): string;
|
|
156
|
+
|
|
87
157
|
declare const createLogger: (name: string) => pino.Logger<never, boolean>;
|
|
88
158
|
declare const logger: pino.Logger<never, boolean>;
|
|
89
159
|
|
|
@@ -336,4 +406,4 @@ declare class GeminiProvider implements LLMProvider {
|
|
|
336
406
|
stream(request: ChatRequest): AsyncGenerator<StreamChunk>;
|
|
337
407
|
}
|
|
338
408
|
|
|
339
|
-
export { AgentNotFoundError, AgentRouter, AgentRunner, AnthropicProvider, type AnthropicProviderOptions, AuthenticationError, BudgetExceededError, ContextWindowError, EventBus, GeminiProvider, type GeminiProviderOptions, InMemorySemanticStore, InMemorySessionStore, type MemoryEntry, OpenAIProvider, type OpenAIProviderOptions, PathTraversalError, PermissionError, PermissionGuard, PostgresSessionStore, PromptGuard, ProviderError, RateLimitError, ScoreLoader, ScoreValidationError, SecretsManager, type SemanticMemoryStore, TokenBudget, ToolTimeoutError, TuttiError, TuttiRuntime, TuttiTracer, UrlValidationError, VoiceError, createBlocklistHook, createCacheHook, createLogger, createLoggingHook, createMaxCostHook, defineScore, initTelemetry, logger, shutdownTelemetry, validateScore };
|
|
409
|
+
export { AgentNotFoundError, AgentRouter, AgentRunner, AnthropicProvider, type AnthropicProviderOptions, AuthenticationError, BudgetExceededError, ContextWindowError, type EvalAssertion, type EvalCase, type EvalReport, type EvalResult, EvalRunner, type EvalSuite, type EvalSummary, EventBus, GeminiProvider, type GeminiProviderOptions, InMemorySemanticStore, InMemorySessionStore, type MemoryEntry, OpenAIProvider, type OpenAIProviderOptions, PathTraversalError, PermissionError, PermissionGuard, PostgresSessionStore, PromptGuard, ProviderError, RateLimitError, ScoreLoader, ScoreValidationError, SecretsManager, type SemanticMemoryStore, TokenBudget, ToolTimeoutError, TuttiError, TuttiRuntime, TuttiTracer, UrlValidationError, VoiceError, createBlocklistHook, createCacheHook, createLogger, createLoggingHook, createMaxCostHook, defineScore, toJSON as evalToJSON, toMarkdown as evalToMarkdown, initTelemetry, logger, printTable as printEvalTable, shutdownTelemetry, validateScore };
|
package/dist/index.js
CHANGED
|
@@ -122,20 +122,21 @@ Only http:// and https:// URLs to public hosts are allowed.`,
|
|
|
122
122
|
// src/hooks/index.ts
|
|
123
123
|
function createLoggingHook(log) {
|
|
124
124
|
return {
|
|
125
|
-
|
|
125
|
+
beforeLLMCall(ctx, request) {
|
|
126
126
|
log.info({ agent: ctx.agent_name, turn: ctx.turn, model: request.model }, "LLM call");
|
|
127
|
-
return request;
|
|
127
|
+
return Promise.resolve(request);
|
|
128
128
|
},
|
|
129
|
-
|
|
129
|
+
afterLLMCall(ctx, response) {
|
|
130
130
|
log.info({ agent: ctx.agent_name, turn: ctx.turn, usage: response.usage }, "LLM response");
|
|
131
|
+
return Promise.resolve();
|
|
131
132
|
},
|
|
132
|
-
|
|
133
|
+
beforeToolCall(ctx, tool, input) {
|
|
133
134
|
log.info({ agent: ctx.agent_name, tool, input }, "Tool call");
|
|
134
|
-
return input;
|
|
135
|
+
return Promise.resolve(input);
|
|
135
136
|
},
|
|
136
|
-
|
|
137
|
+
afterToolCall(ctx, tool, result) {
|
|
137
138
|
log.info({ agent: ctx.agent_name, tool, is_error: result.is_error }, "Tool result");
|
|
138
|
-
return result;
|
|
139
|
+
return Promise.resolve(result);
|
|
139
140
|
}
|
|
140
141
|
};
|
|
141
142
|
}
|
|
@@ -144,150 +145,47 @@ function createCacheHook(store) {
|
|
|
144
145
|
return tool + ":" + JSON.stringify(input);
|
|
145
146
|
}
|
|
146
147
|
return {
|
|
147
|
-
|
|
148
|
+
beforeToolCall(_ctx, tool, input) {
|
|
148
149
|
const cached = store.get(cacheKey(tool, input));
|
|
149
|
-
if (cached) return cached;
|
|
150
|
-
return input;
|
|
150
|
+
if (cached) return Promise.resolve(cached);
|
|
151
|
+
return Promise.resolve(input);
|
|
151
152
|
},
|
|
152
|
-
|
|
153
|
+
afterToolCall(_ctx, tool, result) {
|
|
153
154
|
if (!result.is_error) {
|
|
154
155
|
store.set(cacheKey(tool, result.content), result.content);
|
|
155
156
|
}
|
|
156
|
-
return result;
|
|
157
|
+
return Promise.resolve(result);
|
|
157
158
|
}
|
|
158
159
|
};
|
|
159
160
|
}
|
|
160
161
|
function createBlocklistHook(blockedTools) {
|
|
161
162
|
const blocked = new Set(blockedTools);
|
|
162
163
|
return {
|
|
163
|
-
|
|
164
|
-
return !blocked.has(tool);
|
|
164
|
+
beforeToolCall(_ctx, tool) {
|
|
165
|
+
return Promise.resolve(!blocked.has(tool));
|
|
165
166
|
}
|
|
166
167
|
};
|
|
167
168
|
}
|
|
168
169
|
function createMaxCostHook(maxUsd) {
|
|
169
170
|
let totalCost = 0;
|
|
170
|
-
const
|
|
171
|
-
const
|
|
171
|
+
const INPUT_PER_M2 = 3;
|
|
172
|
+
const OUTPUT_PER_M2 = 15;
|
|
172
173
|
return {
|
|
173
|
-
|
|
174
|
-
totalCost += response.usage.input_tokens / 1e6 *
|
|
174
|
+
afterLLMCall(_ctx, response) {
|
|
175
|
+
totalCost += response.usage.input_tokens / 1e6 * INPUT_PER_M2 + response.usage.output_tokens / 1e6 * OUTPUT_PER_M2;
|
|
176
|
+
return Promise.resolve();
|
|
175
177
|
},
|
|
176
|
-
|
|
178
|
+
beforeLLMCall(ctx, request) {
|
|
177
179
|
if (totalCost >= maxUsd) {
|
|
178
|
-
|
|
180
|
+
return Promise.reject(new Error(
|
|
179
181
|
"Max cost hook: $" + totalCost.toFixed(4) + " exceeds limit $" + maxUsd.toFixed(2) + " for agent " + ctx.agent_name
|
|
180
|
-
);
|
|
182
|
+
));
|
|
181
183
|
}
|
|
182
|
-
return request;
|
|
184
|
+
return Promise.resolve(request);
|
|
183
185
|
}
|
|
184
186
|
};
|
|
185
187
|
}
|
|
186
188
|
|
|
187
|
-
// src/logger.ts
|
|
188
|
-
import pino from "pino";
|
|
189
|
-
var createLogger = (name) => pino({
|
|
190
|
-
name,
|
|
191
|
-
level: process.env.TUTTI_LOG_LEVEL ?? "info",
|
|
192
|
-
transport: process.env.NODE_ENV === "production" ? void 0 : {
|
|
193
|
-
target: "pino-pretty",
|
|
194
|
-
options: {
|
|
195
|
-
colorize: true,
|
|
196
|
-
translateTime: "HH:MM:ss",
|
|
197
|
-
ignore: "pid,hostname"
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
});
|
|
201
|
-
var logger = createLogger("tutti");
|
|
202
|
-
|
|
203
|
-
// src/telemetry.ts
|
|
204
|
-
import { trace, SpanStatusCode } from "@opentelemetry/api";
|
|
205
|
-
var tracer = trace.getTracer("tutti", "1.0.0");
|
|
206
|
-
var TuttiTracer = {
|
|
207
|
-
agentRun(agentName, sessionId, fn) {
|
|
208
|
-
return tracer.startActiveSpan("agent.run", async (span) => {
|
|
209
|
-
span.setAttribute("agent.name", agentName);
|
|
210
|
-
span.setAttribute("session.id", sessionId);
|
|
211
|
-
try {
|
|
212
|
-
const result = await fn();
|
|
213
|
-
span.setStatus({ code: SpanStatusCode.OK });
|
|
214
|
-
return result;
|
|
215
|
-
} catch (err) {
|
|
216
|
-
span.setStatus({
|
|
217
|
-
code: SpanStatusCode.ERROR,
|
|
218
|
-
message: err instanceof Error ? err.message : String(err)
|
|
219
|
-
});
|
|
220
|
-
throw err;
|
|
221
|
-
} finally {
|
|
222
|
-
span.end();
|
|
223
|
-
}
|
|
224
|
-
});
|
|
225
|
-
},
|
|
226
|
-
llmCall(model, fn) {
|
|
227
|
-
return tracer.startActiveSpan("llm.call", async (span) => {
|
|
228
|
-
span.setAttribute("llm.model", model);
|
|
229
|
-
try {
|
|
230
|
-
const result = await fn();
|
|
231
|
-
span.setStatus({ code: SpanStatusCode.OK });
|
|
232
|
-
return result;
|
|
233
|
-
} catch (err) {
|
|
234
|
-
span.setStatus({
|
|
235
|
-
code: SpanStatusCode.ERROR,
|
|
236
|
-
message: err instanceof Error ? err.message : String(err)
|
|
237
|
-
});
|
|
238
|
-
throw err;
|
|
239
|
-
} finally {
|
|
240
|
-
span.end();
|
|
241
|
-
}
|
|
242
|
-
});
|
|
243
|
-
},
|
|
244
|
-
toolCall(toolName, fn) {
|
|
245
|
-
return tracer.startActiveSpan("tool.call", async (span) => {
|
|
246
|
-
span.setAttribute("tool.name", toolName);
|
|
247
|
-
try {
|
|
248
|
-
const result = await fn();
|
|
249
|
-
span.setStatus({ code: SpanStatusCode.OK });
|
|
250
|
-
return result;
|
|
251
|
-
} catch (err) {
|
|
252
|
-
span.setStatus({
|
|
253
|
-
code: SpanStatusCode.ERROR,
|
|
254
|
-
message: err instanceof Error ? err.message : String(err)
|
|
255
|
-
});
|
|
256
|
-
throw err;
|
|
257
|
-
} finally {
|
|
258
|
-
span.end();
|
|
259
|
-
}
|
|
260
|
-
});
|
|
261
|
-
}
|
|
262
|
-
};
|
|
263
|
-
|
|
264
|
-
// src/telemetry-setup.ts
|
|
265
|
-
import { NodeSDK } from "@opentelemetry/sdk-node";
|
|
266
|
-
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
267
|
-
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
|
|
268
|
-
var sdk;
|
|
269
|
-
function initTelemetry(config) {
|
|
270
|
-
if (!config.enabled || sdk) return;
|
|
271
|
-
const endpoint = config.endpoint ?? "http://localhost:4318";
|
|
272
|
-
const exporter = new OTLPTraceExporter({
|
|
273
|
-
url: `${endpoint}/v1/traces`,
|
|
274
|
-
headers: config.headers
|
|
275
|
-
});
|
|
276
|
-
sdk = new NodeSDK({
|
|
277
|
-
traceExporter: exporter,
|
|
278
|
-
instrumentations: [getNodeAutoInstrumentations({ "@opentelemetry/instrumentation-fs": { enabled: false } })],
|
|
279
|
-
serviceName: process.env.OTEL_SERVICE_NAME ?? "tutti"
|
|
280
|
-
});
|
|
281
|
-
sdk.start();
|
|
282
|
-
logger.info({ endpoint }, "OpenTelemetry tracing enabled");
|
|
283
|
-
}
|
|
284
|
-
async function shutdownTelemetry() {
|
|
285
|
-
if (sdk) {
|
|
286
|
-
await sdk.shutdown();
|
|
287
|
-
sdk = void 0;
|
|
288
|
-
}
|
|
289
|
-
}
|
|
290
|
-
|
|
291
189
|
// src/agent-runner.ts
|
|
292
190
|
import { z } from "zod";
|
|
293
191
|
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
@@ -415,6 +313,83 @@ var TokenBudget = class {
|
|
|
415
313
|
}
|
|
416
314
|
};
|
|
417
315
|
|
|
316
|
+
// src/logger.ts
|
|
317
|
+
import pino from "pino";
|
|
318
|
+
var createLogger = (name) => pino({
|
|
319
|
+
name,
|
|
320
|
+
level: process.env.TUTTI_LOG_LEVEL ?? "info",
|
|
321
|
+
transport: process.env.NODE_ENV === "production" ? void 0 : {
|
|
322
|
+
target: "pino-pretty",
|
|
323
|
+
options: {
|
|
324
|
+
colorize: true,
|
|
325
|
+
translateTime: "HH:MM:ss",
|
|
326
|
+
ignore: "pid,hostname"
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
});
|
|
330
|
+
var logger = createLogger("tutti");
|
|
331
|
+
|
|
332
|
+
// src/telemetry.ts
|
|
333
|
+
import { trace, SpanStatusCode } from "@opentelemetry/api";
|
|
334
|
+
var tracer = trace.getTracer("tutti", "1.0.0");
|
|
335
|
+
var TuttiTracer = {
|
|
336
|
+
agentRun(agentName, sessionId, fn) {
|
|
337
|
+
return tracer.startActiveSpan("agent.run", async (span) => {
|
|
338
|
+
span.setAttribute("agent.name", agentName);
|
|
339
|
+
span.setAttribute("session.id", sessionId);
|
|
340
|
+
try {
|
|
341
|
+
const result = await fn();
|
|
342
|
+
span.setStatus({ code: SpanStatusCode.OK });
|
|
343
|
+
return result;
|
|
344
|
+
} catch (err) {
|
|
345
|
+
span.setStatus({
|
|
346
|
+
code: SpanStatusCode.ERROR,
|
|
347
|
+
message: err instanceof Error ? err.message : String(err)
|
|
348
|
+
});
|
|
349
|
+
throw err;
|
|
350
|
+
} finally {
|
|
351
|
+
span.end();
|
|
352
|
+
}
|
|
353
|
+
});
|
|
354
|
+
},
|
|
355
|
+
llmCall(model, fn) {
|
|
356
|
+
return tracer.startActiveSpan("llm.call", async (span) => {
|
|
357
|
+
span.setAttribute("llm.model", model);
|
|
358
|
+
try {
|
|
359
|
+
const result = await fn();
|
|
360
|
+
span.setStatus({ code: SpanStatusCode.OK });
|
|
361
|
+
return result;
|
|
362
|
+
} catch (err) {
|
|
363
|
+
span.setStatus({
|
|
364
|
+
code: SpanStatusCode.ERROR,
|
|
365
|
+
message: err instanceof Error ? err.message : String(err)
|
|
366
|
+
});
|
|
367
|
+
throw err;
|
|
368
|
+
} finally {
|
|
369
|
+
span.end();
|
|
370
|
+
}
|
|
371
|
+
});
|
|
372
|
+
},
|
|
373
|
+
toolCall(toolName, fn) {
|
|
374
|
+
return tracer.startActiveSpan("tool.call", async (span) => {
|
|
375
|
+
span.setAttribute("tool.name", toolName);
|
|
376
|
+
try {
|
|
377
|
+
const result = await fn();
|
|
378
|
+
span.setStatus({ code: SpanStatusCode.OK });
|
|
379
|
+
return result;
|
|
380
|
+
} catch (err) {
|
|
381
|
+
span.setStatus({
|
|
382
|
+
code: SpanStatusCode.ERROR,
|
|
383
|
+
message: err instanceof Error ? err.message : String(err)
|
|
384
|
+
});
|
|
385
|
+
throw err;
|
|
386
|
+
} finally {
|
|
387
|
+
span.end();
|
|
388
|
+
}
|
|
389
|
+
});
|
|
390
|
+
}
|
|
391
|
+
};
|
|
392
|
+
|
|
418
393
|
// src/agent-runner.ts
|
|
419
394
|
var DEFAULT_MAX_TURNS = 10;
|
|
420
395
|
var DEFAULT_MAX_TOOL_CALLS = 20;
|
|
@@ -1106,6 +1081,33 @@ var PermissionGuard = class {
|
|
|
1106
1081
|
}
|
|
1107
1082
|
};
|
|
1108
1083
|
|
|
1084
|
+
// src/telemetry-setup.ts
|
|
1085
|
+
import { NodeSDK } from "@opentelemetry/sdk-node";
|
|
1086
|
+
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
1087
|
+
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
|
|
1088
|
+
var sdk;
|
|
1089
|
+
function initTelemetry(config) {
|
|
1090
|
+
if (!config.enabled || sdk) return;
|
|
1091
|
+
const endpoint = config.endpoint ?? "http://localhost:4318";
|
|
1092
|
+
const exporter = new OTLPTraceExporter({
|
|
1093
|
+
url: `${endpoint}/v1/traces`,
|
|
1094
|
+
headers: config.headers
|
|
1095
|
+
});
|
|
1096
|
+
sdk = new NodeSDK({
|
|
1097
|
+
traceExporter: exporter,
|
|
1098
|
+
instrumentations: [getNodeAutoInstrumentations({ "@opentelemetry/instrumentation-fs": { enabled: false } })],
|
|
1099
|
+
serviceName: process.env.OTEL_SERVICE_NAME ?? "tutti"
|
|
1100
|
+
});
|
|
1101
|
+
sdk.start();
|
|
1102
|
+
logger.info({ endpoint }, "OpenTelemetry tracing enabled");
|
|
1103
|
+
}
|
|
1104
|
+
async function shutdownTelemetry() {
|
|
1105
|
+
if (sdk) {
|
|
1106
|
+
await sdk.shutdown();
|
|
1107
|
+
sdk = void 0;
|
|
1108
|
+
}
|
|
1109
|
+
}
|
|
1110
|
+
|
|
1109
1111
|
// src/runtime.ts
|
|
1110
1112
|
var TuttiRuntime = class _TuttiRuntime {
|
|
1111
1113
|
events;
|
|
@@ -1196,6 +1198,209 @@ Supported: "in-memory", "postgres"`,
|
|
|
1196
1198
|
}
|
|
1197
1199
|
};
|
|
1198
1200
|
|
|
1201
|
+
// src/eval/runner.ts
|
|
1202
|
+
var INPUT_PER_M = 3;
|
|
1203
|
+
var OUTPUT_PER_M = 15;
|
|
1204
|
+
function estimateCost(inputTokens, outputTokens) {
|
|
1205
|
+
return inputTokens / 1e6 * INPUT_PER_M + outputTokens / 1e6 * OUTPUT_PER_M;
|
|
1206
|
+
}
|
|
1207
|
+
var EvalRunner = class {
|
|
1208
|
+
runtime;
|
|
1209
|
+
constructor(score) {
|
|
1210
|
+
this.runtime = new TuttiRuntime(score);
|
|
1211
|
+
}
|
|
1212
|
+
async run(suite) {
|
|
1213
|
+
const results = [];
|
|
1214
|
+
for (const testCase of suite.cases) {
|
|
1215
|
+
const result = await this.runCase(testCase);
|
|
1216
|
+
results.push(result);
|
|
1217
|
+
}
|
|
1218
|
+
const summary = this.summarize(results);
|
|
1219
|
+
return { suite_name: suite.name, results, summary };
|
|
1220
|
+
}
|
|
1221
|
+
async runCase(testCase) {
|
|
1222
|
+
const toolsCalled = [];
|
|
1223
|
+
const unsubscribeToolStart = this.runtime.events.on("tool:start", (e) => {
|
|
1224
|
+
toolsCalled.push(e.tool_name);
|
|
1225
|
+
});
|
|
1226
|
+
const start = Date.now();
|
|
1227
|
+
let output = "";
|
|
1228
|
+
let turns = 0;
|
|
1229
|
+
let usage = { input_tokens: 0, output_tokens: 0 };
|
|
1230
|
+
let error;
|
|
1231
|
+
try {
|
|
1232
|
+
const result = await this.runtime.run(testCase.agent_id, testCase.input);
|
|
1233
|
+
output = result.output;
|
|
1234
|
+
turns = result.turns;
|
|
1235
|
+
usage = result.usage;
|
|
1236
|
+
} catch (err) {
|
|
1237
|
+
error = err instanceof Error ? err.message : String(err);
|
|
1238
|
+
output = "[error] " + error;
|
|
1239
|
+
}
|
|
1240
|
+
unsubscribeToolStart();
|
|
1241
|
+
const durationMs = Date.now() - start;
|
|
1242
|
+
const costUsd = estimateCost(usage.input_tokens, usage.output_tokens);
|
|
1243
|
+
const assertionResults = testCase.assertions.map(
|
|
1244
|
+
(assertion) => this.checkAssertion(assertion, output, toolsCalled, turns, costUsd)
|
|
1245
|
+
);
|
|
1246
|
+
const passedCount = assertionResults.filter((a) => a.passed).length;
|
|
1247
|
+
const score = testCase.assertions.length > 0 ? passedCount / testCase.assertions.length : error ? 0 : 1;
|
|
1248
|
+
return {
|
|
1249
|
+
case_id: testCase.id,
|
|
1250
|
+
case_name: testCase.name,
|
|
1251
|
+
passed: assertionResults.every((a) => a.passed) && !error,
|
|
1252
|
+
score,
|
|
1253
|
+
output,
|
|
1254
|
+
turns,
|
|
1255
|
+
usage,
|
|
1256
|
+
cost_usd: costUsd,
|
|
1257
|
+
duration_ms: durationMs,
|
|
1258
|
+
assertions: assertionResults,
|
|
1259
|
+
error
|
|
1260
|
+
};
|
|
1261
|
+
}
|
|
1262
|
+
checkAssertion(assertion, output, toolsCalled, turns, costUsd) {
|
|
1263
|
+
const val = assertion.value;
|
|
1264
|
+
switch (assertion.type) {
|
|
1265
|
+
case "contains":
|
|
1266
|
+
return {
|
|
1267
|
+
assertion,
|
|
1268
|
+
passed: output.toLowerCase().includes(String(val).toLowerCase()),
|
|
1269
|
+
actual: output.slice(0, 200)
|
|
1270
|
+
};
|
|
1271
|
+
case "not_contains":
|
|
1272
|
+
return {
|
|
1273
|
+
assertion,
|
|
1274
|
+
passed: !output.toLowerCase().includes(String(val).toLowerCase()),
|
|
1275
|
+
actual: output.slice(0, 200)
|
|
1276
|
+
};
|
|
1277
|
+
case "matches_regex": {
|
|
1278
|
+
const regex = new RegExp(String(val), "i");
|
|
1279
|
+
return {
|
|
1280
|
+
assertion,
|
|
1281
|
+
passed: regex.test(output),
|
|
1282
|
+
actual: output.slice(0, 200)
|
|
1283
|
+
};
|
|
1284
|
+
}
|
|
1285
|
+
case "tool_called":
|
|
1286
|
+
return {
|
|
1287
|
+
assertion,
|
|
1288
|
+
passed: toolsCalled.includes(String(val)),
|
|
1289
|
+
actual: toolsCalled.join(", ") || "(none)"
|
|
1290
|
+
};
|
|
1291
|
+
case "tool_not_called":
|
|
1292
|
+
return {
|
|
1293
|
+
assertion,
|
|
1294
|
+
passed: !toolsCalled.includes(String(val)),
|
|
1295
|
+
actual: toolsCalled.join(", ") || "(none)"
|
|
1296
|
+
};
|
|
1297
|
+
case "turns_lte":
|
|
1298
|
+
return {
|
|
1299
|
+
assertion,
|
|
1300
|
+
passed: turns <= Number(val),
|
|
1301
|
+
actual: turns
|
|
1302
|
+
};
|
|
1303
|
+
case "cost_lte":
|
|
1304
|
+
return {
|
|
1305
|
+
assertion,
|
|
1306
|
+
passed: costUsd <= Number(val),
|
|
1307
|
+
actual: Number(costUsd.toFixed(4))
|
|
1308
|
+
};
|
|
1309
|
+
default:
|
|
1310
|
+
logger.warn({ type: assertion.type }, "Unknown assertion type");
|
|
1311
|
+
return { assertion, passed: false, actual: "unknown assertion type" };
|
|
1312
|
+
}
|
|
1313
|
+
}
|
|
1314
|
+
summarize(results) {
|
|
1315
|
+
const passed = results.filter((r) => r.passed).length;
|
|
1316
|
+
const scores = results.map((r) => r.score);
|
|
1317
|
+
const avgScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
|
|
1318
|
+
const totalCost = results.reduce((s, r) => s + r.cost_usd, 0);
|
|
1319
|
+
const totalDuration = results.reduce((s, r) => s + r.duration_ms, 0);
|
|
1320
|
+
return {
|
|
1321
|
+
total: results.length,
|
|
1322
|
+
passed,
|
|
1323
|
+
failed: results.length - passed,
|
|
1324
|
+
avg_score: Number(avgScore.toFixed(2)),
|
|
1325
|
+
total_cost_usd: Number(totalCost.toFixed(4)),
|
|
1326
|
+
total_duration_ms: totalDuration
|
|
1327
|
+
};
|
|
1328
|
+
}
|
|
1329
|
+
};
|
|
1330
|
+
|
|
1331
|
+
// src/eval/report.ts
|
|
1332
|
+
function pad(str, len) {
|
|
1333
|
+
return str.length >= len ? str.slice(0, len) : str + " ".repeat(len - str.length);
|
|
1334
|
+
}
|
|
1335
|
+
function printTable(report) {
|
|
1336
|
+
const { results, summary } = report;
|
|
1337
|
+
console.log();
|
|
1338
|
+
console.log(" Eval suite: " + report.suite_name + " (" + summary.total + " cases)");
|
|
1339
|
+
console.log();
|
|
1340
|
+
for (const r of results) {
|
|
1341
|
+
const icon = r.passed ? "\x1B[32m\u2714\x1B[0m" : "\x1B[31m\u2717\x1B[0m";
|
|
1342
|
+
const score = r.score.toFixed(2);
|
|
1343
|
+
const cost = "$" + r.cost_usd.toFixed(3);
|
|
1344
|
+
const line = " " + icon + " " + pad(r.case_id, 10) + " " + pad(r.case_name, 28) + " " + pad(score, 6) + " " + r.turns + " turns " + cost;
|
|
1345
|
+
console.log(line);
|
|
1346
|
+
if (!r.passed) {
|
|
1347
|
+
for (const a of r.assertions) {
|
|
1348
|
+
if (!a.passed) {
|
|
1349
|
+
const desc = a.assertion.description ?? a.assertion.type + ": " + String(a.assertion.value);
|
|
1350
|
+
console.log(" \x1B[31m\u21B3 FAIL: " + desc + " (actual: " + String(a.actual).slice(0, 60) + ")\x1B[0m");
|
|
1351
|
+
}
|
|
1352
|
+
}
|
|
1353
|
+
if (r.error) {
|
|
1354
|
+
console.log(" \x1B[31m\u21B3 ERROR: " + r.error.slice(0, 80) + "\x1B[0m");
|
|
1355
|
+
}
|
|
1356
|
+
}
|
|
1357
|
+
}
|
|
1358
|
+
const pct = summary.total > 0 ? Math.round(summary.passed / summary.total * 100) : 0;
|
|
1359
|
+
console.log();
|
|
1360
|
+
console.log(
|
|
1361
|
+
" Results: " + summary.passed + "/" + summary.total + " passed (" + pct + "%) | Avg: " + summary.avg_score.toFixed(2) + " | Total: $" + summary.total_cost_usd.toFixed(3)
|
|
1362
|
+
);
|
|
1363
|
+
console.log();
|
|
1364
|
+
}
|
|
1365
|
+
function toJSON(report) {
|
|
1366
|
+
return JSON.stringify(report, null, 2);
|
|
1367
|
+
}
|
|
1368
|
+
function toMarkdown(report) {
|
|
1369
|
+
const { results, summary } = report;
|
|
1370
|
+
const lines = [];
|
|
1371
|
+
lines.push("## Eval: " + report.suite_name);
|
|
1372
|
+
lines.push("");
|
|
1373
|
+
lines.push("| Status | ID | Name | Score | Turns | Cost |");
|
|
1374
|
+
lines.push("|--------|-----|------|-------|-------|------|");
|
|
1375
|
+
for (const r of results) {
|
|
1376
|
+
const icon = r.passed ? "pass" : "FAIL";
|
|
1377
|
+
lines.push(
|
|
1378
|
+
"| " + icon + " | " + r.case_id + " | " + r.case_name + " | " + r.score.toFixed(2) + " | " + r.turns + " | $" + r.cost_usd.toFixed(3) + " |"
|
|
1379
|
+
);
|
|
1380
|
+
}
|
|
1381
|
+
lines.push("");
|
|
1382
|
+
const pct = summary.total > 0 ? Math.round(summary.passed / summary.total * 100) : 0;
|
|
1383
|
+
lines.push(
|
|
1384
|
+
"**Results:** " + summary.passed + "/" + summary.total + " passed (" + pct + "%) | Avg score: " + summary.avg_score.toFixed(2) + " | Total cost: $" + summary.total_cost_usd.toFixed(3)
|
|
1385
|
+
);
|
|
1386
|
+
const failed = results.filter((r) => !r.passed);
|
|
1387
|
+
if (failed.length > 0) {
|
|
1388
|
+
lines.push("");
|
|
1389
|
+
lines.push("### Failures");
|
|
1390
|
+
lines.push("");
|
|
1391
|
+
for (const r of failed) {
|
|
1392
|
+
lines.push("**" + r.case_id + "** \u2014 " + r.case_name);
|
|
1393
|
+
for (const a of r.assertions.filter((x) => !x.passed)) {
|
|
1394
|
+
const desc = a.assertion.description ?? a.assertion.type + ": " + String(a.assertion.value);
|
|
1395
|
+
lines.push("- " + desc + " (actual: `" + String(a.actual).slice(0, 80) + "`)");
|
|
1396
|
+
}
|
|
1397
|
+
if (r.error) lines.push("- Error: " + r.error);
|
|
1398
|
+
lines.push("");
|
|
1399
|
+
}
|
|
1400
|
+
}
|
|
1401
|
+
return lines.join("\n");
|
|
1402
|
+
}
|
|
1403
|
+
|
|
1199
1404
|
// src/agent-router.ts
|
|
1200
1405
|
import { z as z2 } from "zod";
|
|
1201
1406
|
var AgentRouter = class {
|
|
@@ -2050,6 +2255,7 @@ export {
|
|
|
2050
2255
|
AuthenticationError,
|
|
2051
2256
|
BudgetExceededError,
|
|
2052
2257
|
ContextWindowError,
|
|
2258
|
+
EvalRunner,
|
|
2053
2259
|
EventBus,
|
|
2054
2260
|
GeminiProvider,
|
|
2055
2261
|
InMemorySemanticStore,
|
|
@@ -2078,8 +2284,11 @@ export {
|
|
|
2078
2284
|
createLoggingHook,
|
|
2079
2285
|
createMaxCostHook,
|
|
2080
2286
|
defineScore,
|
|
2287
|
+
toJSON as evalToJSON,
|
|
2288
|
+
toMarkdown as evalToMarkdown,
|
|
2081
2289
|
initTelemetry,
|
|
2082
2290
|
logger,
|
|
2291
|
+
printTable as printEvalTable,
|
|
2083
2292
|
shutdownTelemetry,
|
|
2084
2293
|
validateScore
|
|
2085
2294
|
};
|